diff --git a/README.md b/README.md index 4fe555609..c96503eb4 100644 --- a/README.md +++ b/README.md @@ -17,6 +17,7 @@ Currently working:\ -regions with different parameters, periodic/open boundary conditions, freezing spins\ -passing standard problems #1-#4\ -Qt GUI - almost every parameter can be adjusted from there\ +-Max system sizes supported: (2^14, 2^15, 2^15)\ \ Not working yet (compared to current Spirit release version):\ -Methods, other than LLG\ diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.comp b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.comp new file mode 100644 index 000000000..76f69e424 --- /dev/null +++ b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.comp @@ -0,0 +1,610 @@ +#version 450 + +const float M_PI = 3.1415926535897932384626433832795; +const float M_SQRT1_2 = 0.70710678118654752440084436210485; + +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 +layout (constant_id = 4) const uint fft_dim = 2048; +layout (constant_id = 5) const bool inverse = false; +layout (constant_id = 6) const bool zeropad_0 = false; +layout (constant_id = 7) const bool zeropad_1 = false; +layout (constant_id = 8) const uint inputStride_0 = 1; +layout (constant_id = 9) const uint inputStride_1 = 1; +layout (constant_id = 10) const uint inputStride_2 = 1; +layout (constant_id = 11) const uint inputStride_3 = 1; +layout (constant_id = 12) const uint inputStride_4 = 1; +layout (constant_id = 13) const uint outputStride_0 = 1; +layout (constant_id = 14) const uint outputStride_1 = 1; +layout (constant_id = 15) const uint outputStride_2 = 1; +layout (constant_id = 16) const uint outputStride_3 = 1; +layout (constant_id = 17) const uint outputStride_4 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; +layout (constant_id = 21) const uint 
numStages = 1; +layout (constant_id = 22) const uint stageRadix_0 = 8; +layout (constant_id = 23) const uint stageRadix_1 = 8; +layout (constant_id = 24) const uint ratio_0 = 8; +layout (constant_id = 25) const uint ratio_1 = 8; +layout (constant_id = 26) const bool ratioDirection_0 = false; +layout (constant_id = 27) const bool ratioDirection_1 = true; +layout (constant_id = 28) const uint inputOffset = 0; +layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; + +layout(push_constant) uniform PushConsts +{ + uint coordinate; + uint batchID; +} consts; + + +layout(std430, binding = 0) buffer Data { + vec2 inputs[]; +}; + +layout(std430, binding = 1) buffer Data2 { + vec2 outputs[]; +}; +uint indexInput(uint index) { /* Flat index into inputs[] for a logical element index: inputOffset plus index, gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, consts.coordinate and consts.batchID scaled by inputStride_0 .. inputStride_4 respectively. */ + return inputOffset+index * inputStride_0 + gl_GlobalInvocationID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + consts.batchID * inputStride_4; +} +uint indexOutput(uint index) { /* Flat index into outputs[]: same formula as indexInput but with outputOffset and outputStride_0 .. outputStride_4. */ + return outputOffset+index * outputStride_0 + gl_GlobalInvocationID.y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3 + consts.batchID * outputStride_4; +} +uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { /* Returns pos divided by ratio, plus (pos modulo ratio) times fft_dim. The ratioDirection argument is not read by this body. */ + return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); + +} +void radix2(inout vec2 values[2], vec2 w) { /* In-place 2-point butterfly: temp is the complex product of values[1] and w; values[1] becomes values[0] minus temp and values[0] becomes values[0] plus temp. w is taken by value and is not modified. */ + vec2 temp; + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; +} + +void radix4(inout vec2 values[4],inout vec2 w) { /* In-place 4-point butterfly: two layers (pairs at distance 2, then distance 1) followed by a swap of values[1] and values[2]. NOTE: w is inout and is overwritten inside, so the variable passed by the caller does not keep its value. */ + + //DIF 1st stage with double angle + vec2 temp; + temp.x=values[2].x*w.x-values[2].y*w.y; + temp.y=values[2].y*w.x+values[2].x*w.y; + values[2]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + 
values[3]=values[1]-temp; + values[1]=values[1]+temp; + + //DIF 2nd stage with half angle + w = normalize(w + vec2(1.0, 0.0)); /* w is replaced by the normalized sum of w and (1, 0); per the stage comment this is the half-angle twiddle (assumes w has unit length) */ + + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; + + w = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); /* w times -i when inverse is set, w times +i otherwise */ + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + values[3]=values[2]-temp; + values[2]=values[2]+temp; + + temp = values[1]; /* final reorder: exchange values[1] and values[2] */ + values[1]=values[2]; + values[2]=temp; +} + +void radix8(inout vec2 values[8], inout vec2 w) { /* In-place 8-point butterfly: three layers (pairs at distance 4, then 2, then 1) followed by swaps of values[1] with values[4] and values[3] with values[6]. NOTE: w is inout and is overwritten inside. */ + //DIF 1st stage with quadruple angle + + vec2 temp; + temp.x=values[4].x*w.x-values[4].y*w.y; + temp.y=values[4].y*w.x+values[4].x*w.y; + values[4]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[5].x*w.x-values[5].y*w.y; + temp.y=values[5].y*w.x+values[5].x*w.y; + values[5]=values[1]-temp; + values[1]=values[1]+temp; + + temp.x=values[6].x*w.x-values[6].y*w.y; + temp.y=values[6].y*w.x+values[6].x*w.y; + values[6]=values[2]-temp; + values[2]=values[2]+temp; + + temp.x=values[7].x*w.x-values[7].y*w.y; + temp.y=values[7].y*w.x+values[7].x*w.y; + values[7]=values[3]-temp; + values[3]=values[3]+temp; + + //DIF 2nd stage with double angle + w = normalize(w + vec2(1.0, 0.0)); + + temp.x=values[2].x*w.x-values[2].y*w.y; + temp.y=values[2].y*w.x+values[2].x*w.y; + values[2]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + values[3]=values[1]-temp; + values[1]=values[1]+temp; + + vec2 iw = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); /* iw is w times -i when inverse is set, w times +i otherwise */ + + temp.x=values[6].x*iw.x-values[6].y*iw.y; + temp.y=values[6].y*iw.x+values[6].x*iw.y; + values[6]=values[4]-temp; + values[4]=values[4]+temp; + + temp.x=values[7].x*iw.x-values[7].y*iw.y; + temp.y=values[7].y*iw.x+values[7].x*iw.y; + values[7]=values[5]-temp; + values[5]=values[5]+temp; + + //DIF 3rd stage with angle + w = normalize(w + vec2(1.0, 0.0)); + + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; + + iw = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); + + temp.x=values[3].x*iw.x-values[3].y*iw.y; + temp.y=values[3].y*iw.x+values[3].x*iw.y; + values[3]=values[2]-temp; + values[2]=values[2]+temp; + + iw.x=(inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; + iw.y=(inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; /* iw is now w times (1-i) M_SQRT1_2 when inverse is set, w times (1+i) M_SQRT1_2 otherwise, i.e. w turned by 45 degrees */ + + temp.x=values[5].x*iw.x-values[5].y*iw.y; + temp.y=values[5].y*iw.x+values[5].x*iw.y; + values[5]=values[4]-temp; + values[4]=values[4]+temp; + + w = (inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); + + temp.x=values[7].x*w.x-values[7].y*w.y; + temp.y=values[7].y*w.x+values[7].x*w.y; + values[7]=values[6]-temp; + values[6]=values[6]+temp; /* final reorder below: swap values[1] with values[4] and values[3] with values[6] */ + + temp = values[1]; + values[1]=values[4]; + values[4]=temp; + + temp = values[3]; + values[3]=values[6]; + values[6]=temp; + +} +const uint max_shared_vec2=4096; +const uint ratio = 4;// reg mem/shared mem +const uint tempSize = fft_dim/gl_WorkGroupSize.x; +shared vec2 sdata[max_shared_vec2]; /* workgroup-shared scratch array; main uses it to exchange values between invocations after each radix8 stage */ + +void main() { /* Each invocation keeps 32 complex values in temp0..temp3 (8 each). Flow: load from inputs[]; if passID>0 on the forward pass, multiply by a per-element twiddle; run numStages-1 radix8 stages with an exchange through sdata after each; run one final radix4 stage; if passID>0 on the inverse pass, multiply by the conjugate twiddle; store to outputs[]. */ + + vec2 temp0[8]; + vec2 temp1[8]; + vec2 temp2[8]; + vec2 temp3[8]; + uint stageSize=1; + float stageAngle=(inverse) ? 
-M_PI : M_PI; + if (zeropad_0&&(!inverse)){ /* forward pass with zeropad_0: only slots 0..3 of each register group are read from inputs[]; slots 4..7 are set to zero by the second loop */ + for(uint i = 0; i < 4; ++i){ + temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp2[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp3[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + } + for(uint i = 4; i < 8; ++i){ + temp0[i]=vec2(0,0); + temp1[i]=vec2(0,0); + temp2[i]=vec2(0,0); + temp3[i]=vec2(0,0); + } + }else { /* otherwise all 32 values are read; temp0..temp3 take element offsets i, i+8, i+16, i+24 (in units of gl_WorkGroupSize.x) */ + for(uint i = 0; i < 8; ++i){ + temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp2[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp3[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + } + } + + if ((passID>0)&&(!inverse)){ /* forward pass with passID>0: every value is multiplied by (cos(angle), sin(angle)), where angle is 2 pi times gl_WorkGroupID.x times the element offset inside the workgroup, divided by fft_dim_full */ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp0[i].x*mult.x-temp0[i].y*mult.y, temp0[i].y*mult.x+temp0[i].x*mult.y); + temp0[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp1[i].x*mult.x-temp1[i].y*mult.y, temp1[i].y*mult.x+temp1[i].x*mult.y); + temp1[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp2[i].x*mult.x-temp2[i].y*mult.y, 
temp2[i].y*mult.x+temp2[i].x*mult.y); + temp2[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp3[i].x*mult.x-temp3[i].y*mult.y, temp3[i].y*mult.x+temp3[i].x*mult.y); + temp3[i]=res; + } + memoryBarrierShared(); + barrier(); + } + for (uint n=0; n < numStages-1; n++){//all stages but last are radix-8 + { /* register regroup: afterwards tempK holds slots K and K+4 of each of the old temp0..temp3 (new tempK[2m+j] is old temp_m[K+4j]); sort0 and sort1 are scratch, and sort0[6..7] temporarily saves old temp0[1] and temp0[5] */ + vec2 sort0[8]; + vec2 sort1[8]; + for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + { /* this and the next three blocks run radix8 on temp0..temp3; the twiddle angle is stageAngle times ((gl_LocalInvocationID.x plus k times gl_WorkGroupSize.x) masked with stageSize-1) for k = 0..3 */ + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp0, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp1, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x) & (stageSize - 1u); + 
float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp2, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp3, twiddleFactor); + } + memoryBarrierShared(); + barrier(); + + //all stages but last have no shifts larger than shared memory size - no need for swap buffer. Need to serialize thread groups in ratio_over_max amount of batches and exchange data + { /* exchange through sdata, one register group at a time: write the 8 values at outputIndex plus stageSize times j, barrier, read back at gl_LocalInvocationID.x plus gl_WorkGroupSize.x times j, barrier. Values are scaled by 0.125 on write when inverse is set. The same outputIndex is used for all four groups. */ + float stageNormalization = (inverse) ? 0.125 : 1.0; + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + uint blockInvocationID = (gl_LocalInvocationID.x) - stageInvocationID; + uint outputIndex = stageInvocationID + blockInvocationID * 8; + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp0[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp0[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp1[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp1[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp2[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp2[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp3[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp3[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + } + + stageSize=stageSize*8; + 
stageAngle=stageAngle*0.125f; /* prepare the next stage: stageSize was multiplied by 8 just above and the angle step is divided by 8 here */ + } + + + //last stage - arbitrary radix + //stageSize=4096; + { /* register regroup for the final stage: new tempK[4t+m] is old temp_m[t+2K] for t = 0..1, so each register group holds two 4-value inputs for radix4; sort0[2] and sort0[6] temporarily save old temp0[2] and temp0[3] */ + vec2 sort0[8]; + vec2 sort1[8]; + for (uint t=0; t<2; t++){ + sort0[t*4]=temp0[t+4]; + sort0[t*4+1]=temp1[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4]=temp0[t+6]; + sort1[t*4+1]=temp1[t+6]; + } + for (uint t=0; t<2; t++) + sort0[t*4+2]=temp0[t+2]; + + for (uint t=0; t<2; t++) + temp0[t*4]=temp0[t]; + for (uint t=0; t<2; t++){ + temp0[t*4+1]=temp1[t]; + temp0[t*4+2]=temp2[t]; + temp0[t*4+3]=temp3[t]; + } + for (uint t=0; t<2; t++) + temp1[t*4+1]=temp1[t+2]; + for (uint t=0; t<2; t++){ + temp1[t*4]=sort0[t*4+2]; + temp1[t*4+2]=temp2[t+2]; + temp1[t*4+3]=temp3[t+2]; + } + + for (uint t=0; t<2; t++){ + sort0[t*4+2]=temp2[t+4]; + sort0[t*4+3]=temp3[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4+2]=temp2[t+6]; + sort1[t*4+3]=temp3[t+6]; + } + + for (uint t=0; t<8; t++){ + temp2[t]=sort0[t]; + temp3[t]=sort1[t]; + } + + } + + + for (uint i=0; i<2; i++){ /* radix4 on each half (i = 0, 1) of temp0 via a local copy; the following three loops do the same for temp1, temp2 and temp3 with the twiddle index offset by i+2, i+4 and i+6 work-group widths */ + uint stageInvocationID = (gl_LocalInvocationID.x + i*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp0[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp0[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+2)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp1[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp1[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 
values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp2[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp2[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+6)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp3[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp3[i*4+j]=values[j]; + } + + } + { /* same register regroup as at the top of the radix8 stage loop: new tempK[2m+j] is old temp_m[K+4j] */ + vec2 sort0[8]; + vec2 sort1[8]; + for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + + memoryBarrierShared(); + barrier(); + if ((passID>0)&&(inverse)){ /* inverse pass with passID>0: same angle formula as the forward twiddle, but the multiplier is (cos(angle), -sin(angle)) and it is applied after the butterfly stages */ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp0[i].x*mult.x-temp0[i].y*mult.y, temp0[i].y*mult.x+temp0[i].x*mult.y); + temp0[i]=res; + } + for (uint i=0; i < 8; i++){ + float 
angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp1[i].x*mult.x-temp1[i].y*mult.y, temp1[i].y*mult.x+temp1[i].x*mult.y); + temp1[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp2[i].x*mult.x-temp2[i].y*mult.y, temp2[i].y*mult.x+temp2[i].x*mult.y); + temp2[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp3[i].x*mult.x-temp3[i].y*mult.y, temp3[i].y*mult.x+temp3[i].x*mult.y); + temp3[i]=res; + } + memoryBarrierShared(); + barrier(); + } + + if (zeropad_0&&(inverse)){ /* inverse pass with zeropad_0: only temp0 and temp1 (element offsets i and i+8) are written to outputs[]; temp2 and temp3 are not stored. stageNormalization always evaluates to 0.25 on this branch because inverse is true here. */ + float stageNormalization = (inverse) ? 0.25 : 1.0; + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp0[i]*stageNormalization; + } + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp1[i]*stageNormalization; + } + + } + else{ /* otherwise all four register groups are stored (offsets i, i+8, i+16, i+24), scaled by 0.25 when inverse is set and by 1.0 on the forward pass */ + + float stageNormalization = (inverse) ? 
0.25 : 1.0; + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp0[i]*stageNormalization; + } + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp1[i]*stageNormalization; + } + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp2[i]*stageNormalization; + } + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp3[i]*stageNormalization; + } + + } + +} diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.spv b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.spv new file mode 100644 index 000000000..ded98c545 Binary files /dev/null and b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2c_16384.spv differ diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.comp b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.comp new file mode 100644 index 000000000..d5b8eb93b --- /dev/null +++ b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.comp @@ -0,0 +1,824 @@ +#version 450 + +const float M_PI = 3.1415926535897932384626433832795; +const float M_SQRT1_2 = 0.70710678118654752440084436210485; + +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 +layout (constant_id = 4) const uint fft_dim = 2048; +layout (constant_id = 5) const bool inverse = false; +layout (constant_id = 6) const bool zeropad_0 = false; +layout (constant_id = 7) const bool zeropad_1 = false; +layout (constant_id = 8) const uint inputStride_0 = 1; +layout (constant_id = 9) const uint inputStride_1 = 1; +layout (constant_id = 10) const uint inputStride_2 = 1; +layout (constant_id = 11) const uint inputStride_3 = 1; +layout (constant_id = 12) const uint inputStride_4 = 1; +layout 
(constant_id = 13) const uint outputStride_0 = 1; +layout (constant_id = 14) const uint outputStride_1 = 1; +layout (constant_id = 15) const uint outputStride_2 = 1; +layout (constant_id = 16) const uint outputStride_3 = 1; +layout (constant_id = 17) const uint outputStride_4 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; +layout (constant_id = 21) const uint numStages = 1; +layout (constant_id = 22) const uint stageRadix_0 = 8; +layout (constant_id = 23) const uint stageRadix_1 = 8; +layout (constant_id = 24) const uint ratio_0 = 8; +layout (constant_id = 25) const uint ratio_1 = 8; +layout (constant_id = 26) const bool ratioDirection_0 = false; +layout (constant_id = 27) const bool ratioDirection_1 = true; +layout (constant_id = 28) const uint inputOffset = 0; +layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; + +layout(push_constant) uniform PushConsts +{ + uint coordinate; + uint batchID; +} consts; + +layout(std430, binding = 0) buffer Data { + vec2 inputs[]; +}; + +layout(std430, binding = 1) buffer Data2 { + float outputs[]; +}; + +uint indexInput(uint index_x, uint index_y) { /* Flat index into inputs[] (vec2 elements): inputOffset plus index_x, index_y, gl_GlobalInvocationID.z, consts.coordinate and consts.batchID scaled by inputStride_0 .. inputStride_4. Unlike the single-argument variant in the c2c shader, the second coordinate is passed explicitly by the caller. */ + return inputOffset + index_x * inputStride_0 + index_y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + consts.batchID * inputStride_4 ; +} +uint indexOutput(uint index) { /* Flat index into outputs[], which is declared as float[] in this shader. The y, z, coordinate and batchID terms carry an extra factor of 2; NOTE(review): presumably because the output strides are counted in complex (two-float) units - confirm against the host code that sets outputStride_1 .. outputStride_4. */ + return outputOffset + index * outputStride_0 + 2*gl_GlobalInvocationID.y* outputStride_1 + 2*gl_GlobalInvocationID.z * outputStride_2 + 2*consts.coordinate * outputStride_3+ 2*consts.batchID * outputStride_4; +} + +uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { /* Returns pos divided by ratio, plus (pos modulo ratio) times fft_dim/4. The ratioDirection argument is not read by the active return statement. */ + + return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/4)); + + /*if (ratioDirection) + 
return ((pos >> ratio)+(pos & (1<>1)); + else + return (((pos)/(fft_dim>>1))+((pos)%(fft_dim>>1))*(1<0) { + for (uint i=0; i<4; i++){ + temp0[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i+4].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp3[i+4].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp0[i].x=(sdata[i*gl_WorkGroupSize.x-1].x-sdata[(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i].y=(sdata[i*gl_WorkGroupSize.x-1].y+sdata[(i+4)*gl_WorkGroupSize.x-1].x); + + } + for (uint i=0; i<4; i++){ + temp3[i+4].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp3[i+4].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp0[4].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp0[4].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + + vec2 temp[2]; + if (zeropad_1){ + temp[0]=inputs[indexInput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]; + temp[1]=inputs[indexInput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]; + } else { + temp[0]=inputs[indexInput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]; + temp[1]=inputs[indexInput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]; + + } + temp0[0].x=(temp[0].x-temp[1].y); + temp0[0].y=(temp[0].y+temp[1].x); + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + 
sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + temp0[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp3[i].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp0[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp3[i].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp3[0].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp3[0].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + temp1[0].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp1[0].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x, 
gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + temp1[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i+4].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp2[i+4].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp1[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i+4].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp2[i+4].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp2[4].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp2[4].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + temp1[4].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp1[4].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + } + + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + 
temp1[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp2[i].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp1[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp2[i].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp2[0].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp2[0].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + } + memoryBarrierShared(); + barrier(); + + } + else{ + for (uint i=0; i<4; i++){ + sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sdata[4*gl_WorkGroupSize.x + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0) { + for (uint i=0; i<4; i++){ + temp0[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + 
temp0[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i+4].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp3[i+4].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp0[i].x=(sdata[i*gl_WorkGroupSize.x-1].x-sdata[(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i].y=(sdata[i*gl_WorkGroupSize.x-1].y+sdata[(i+4)*gl_WorkGroupSize.x-1].x); + + } + for (uint i=0; i<4; i++){ + temp3[i+4].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp3[i+4].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp0[4].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp0[4].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + + vec2 temp[2]; + if (zeropad_1){ + temp[0]=inputs[indexInput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]; + temp[1]=inputs[indexInput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]; + } else { + temp[0]=inputs[indexInput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]; + temp[1]=inputs[indexInput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]; + + } + temp0[0].x=(temp[0].x-temp[1].y); + temp0[0].y=(temp[0].y+temp[1].x); + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sdata[4*gl_WorkGroupSize.x + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+20)*gl_WorkGroupSize.x, 
gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + temp0[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp3[i].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp0[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp0[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp3[i].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp3[i].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp3[0].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp3[0].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + temp1[0].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp1[0].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sdata[4*gl_WorkGroupSize.x + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + + 
memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + temp1[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i+4].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp2[i+4].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp1[i].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i+4].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp2[i+4].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp2[4].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp2[4].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + temp1[4].x=(sdata[4*gl_WorkGroupSize.x-1].x-sdata[8*gl_WorkGroupSize.x-1].y); + temp1[4].y=(sdata[4*gl_WorkGroupSize.x-1].y+sdata[8*gl_WorkGroupSize.x-1].x); + + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+12)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sdata[4*gl_WorkGroupSize.x + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+(i+28)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + + memoryBarrierShared(); + barrier(); + + if 
(gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + temp1[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i].x=(sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y); + temp2[i].y=(-sdata[4*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].y+sdata[8*gl_WorkGroupSize.x - (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x) - 1].x); + } + } else{ + for (uint i=1; i<4; i++){ + temp1[i+4].x=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].x-sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].y); + temp1[i+4].y=(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1].y+sdata[gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1].x); + temp2[i].x=(sdata[(4 - i)*gl_WorkGroupSize.x - 1].x+sdata[(8-i)*gl_WorkGroupSize.x - 1].y); + temp2[i].y=(-sdata[(4 - i)*gl_WorkGroupSize.x - 1].y+sdata[(8-i)*gl_WorkGroupSize.x - 1].x); + } + temp2[0].x=(sdata[4*gl_WorkGroupSize.x - 1].x+sdata[8*gl_WorkGroupSize.x - 1].y); + temp2[0].y=(-sdata[4*gl_WorkGroupSize.x - 1].y+sdata[8*gl_WorkGroupSize.x - 1].x); + + } + } + + + memoryBarrierShared(); + barrier(); + + uint stageSize=1; + float stageAngle=(inverse) ? 
-M_PI : M_PI; + + for (uint n=0; n < numStages-1; n++){//all stages but last are radix-8 + { + vec2 sort0[8]; + vec2 sort1[8]; + for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + { + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp0, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp1, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp2, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp3, twiddleFactor); + } + 
memoryBarrierShared(); + barrier(); + + //all stages but last have no shifts larger than shared memory size - no need for swap buffer. Need to serialize thread groups in ratio_over_max amount of batches and exchange data + { + float stageNormalization = (inverse) ? 0.125 : 1.0; + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + uint blockInvocationID = (gl_LocalInvocationID.x) - stageInvocationID; + uint outputIndex = stageInvocationID + blockInvocationID * 8; + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp0[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp0[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp1[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp1[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp2[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp2[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp3[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp3[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + } + + stageSize=stageSize*8; + stageAngle=stageAngle*0.125f; + } + + + //last stage - arbitrary radix + //stageSize=4096; + { + vec2 sort0[8]; + vec2 sort1[8]; + for (uint t=0; t<2; t++){ + sort0[t*4]=temp0[t+4]; + sort0[t*4+1]=temp1[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4]=temp0[t+6]; + sort1[t*4+1]=temp1[t+6]; + } + for (uint t=0; t<2; t++) + sort0[t*4+2]=temp0[t+2]; + + for (uint t=0; 
t<2; t++) + temp0[t*4]=temp0[t]; + for (uint t=0; t<2; t++){ + temp0[t*4+1]=temp1[t]; + temp0[t*4+2]=temp2[t]; + temp0[t*4+3]=temp3[t]; + } + for (uint t=0; t<2; t++) + temp1[t*4+1]=temp1[t+2]; + for (uint t=0; t<2; t++){ + temp1[t*4]=sort0[t*4+2]; + temp1[t*4+2]=temp2[t+2]; + temp1[t*4+3]=temp3[t+2]; + } + + for (uint t=0; t<2; t++){ + sort0[t*4+2]=temp2[t+4]; + sort0[t*4+3]=temp3[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4+2]=temp2[t+6]; + sort1[t*4+3]=temp3[t+6]; + } + + for (uint t=0; t<8; t++){ + temp2[t]=sort0[t]; + temp3[t]=sort1[t]; + } + + } + + + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + i*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp0[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp0[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+2)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp1[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp1[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp2[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp2[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+6)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = 
vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp3[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp3[i*4+j]=values[j]; + } + + } + { + vec2 sort0[8]; + vec2 sort1[8]; + for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + + memoryBarrierShared(); + barrier(); + + if (zeropad_0){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=temp0[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=temp0[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=temp1[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)+ outputStride_1]=temp1[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + } + else{ + + float stageNormalization = (inverse) ? 
0.25 : 1.0; + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=temp0[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=temp0[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=temp1[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)+ outputStride_1]=temp1[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x)]=temp2[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x)+ outputStride_1]=temp2[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x)]=temp3[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x)+ outputStride_1]=temp3[i].y*stageNormalization; + } + memoryBarrierShared(); + barrier(); + } + +} diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.spv b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.spv new file mode 100644 index 000000000..980b9ea23 Binary files /dev/null and b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_c2r_16384.spv differ diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.comp b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.comp new file mode 100644 index 000000000..184b7e90b --- /dev/null +++ b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.comp @@ -0,0 +1,806 @@ +#version 450 + +const float M_PI = 3.1415926535897932384626433832795; +const float M_SQRT1_2 = 0.70710678118654752440084436210485; + +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: 
total <1024 +/* Specialization constants (constant_id 4 to 30); each initializer is the default that applies when no override is supplied for that id. */layout (constant_id = 4) const uint fft_dim = 2048; +layout (constant_id = 5) const bool inverse = false; +layout (constant_id = 6) const bool zeropad_0 = false; +layout (constant_id = 7) const bool zeropad_1 = false; +layout (constant_id = 8) const uint inputStride_0 = 1; +layout (constant_id = 9) const uint inputStride_1 = 1; +layout (constant_id = 10) const uint inputStride_2 = 1; +layout (constant_id = 11) const uint inputStride_3 = 1; +layout (constant_id = 12) const uint inputStride_4 = 1; +layout (constant_id = 13) const uint outputStride_0 = 1; +layout (constant_id = 14) const uint outputStride_1 = 1; +layout (constant_id = 15) const uint outputStride_2 = 1; +layout (constant_id = 16) const uint outputStride_3 = 1; +layout (constant_id = 17) const uint outputStride_4 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; +layout (constant_id = 21) const uint numStages = 1; +layout (constant_id = 22) const uint stageRadix_0 = 8; +layout (constant_id = 23) const uint stageRadix_1 = 8; +layout (constant_id = 24) const uint ratio_0 = 8; +layout (constant_id = 25) const uint ratio_1 = 8; +layout (constant_id = 26) const bool ratioDirection_0 = false; +layout (constant_id = 27) const bool ratioDirection_1 = true; +layout (constant_id = 28) const uint inputOffset = 0; +layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +/* Half, quarter and eighth of fft_dim. */const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; + +/* Push constants; coordinate and batchID are consumed by indexInput and indexOutput. */layout(push_constant) uniform PushConsts +{ + uint coordinate; + uint batchID; +} consts; + + +/* Binding 0: input buffer, addressed as scalar floats. */layout(std430, binding = 0) buffer Data { + float inputs[]; +}; + +/* Binding 1: output buffer, addressed as vec2 elements. */layout(std430, binding = 1) buffer Data2 { + vec2 outputs[]; +}; +/* Returns the flat offset into inputs[] for element index: inputOffset + index*inputStride_0, plus twice each of gl_GlobalInvocationID.y, gl_GlobalInvocationID.z, consts.coordinate and consts.batchID multiplied by inputStride_1 to inputStride_4 respectively. */uint indexInput(uint index) { + return inputOffset+index * inputStride_0 + 2*gl_GlobalInvocationID.y * inputStride_1 
+ 2*gl_GlobalInvocationID.z * inputStride_2 + 2*consts.coordinate * inputStride_3 + 2*consts.batchID * inputStride_4 ; +} +/* Returns the flat offset into outputs[]: outputOffset plus index_x, index_y, gl_GlobalInvocationID.z, consts.coordinate and consts.batchID, each multiplied by outputStride_0 to outputStride_4 respectively (no factor of two here, unlike indexInput). */uint indexOutput(uint index_x, uint index_y) { + return outputOffset+index_x * outputStride_0 + index_y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3+ consts.batchID * outputStride_4; +} +/* Remaps pos between two index orders built from ratio and fft_dim/4. When ratioDirection is true the quotient by ratio becomes the low part and the remainder is scaled by fft_dim/4; otherwise the roles of ratio and fft_dim/4 are exchanged. */uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { + if (ratioDirection) + return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/4)); + else + return (((pos)/(fft_dim/4))+((pos)%(fft_dim/4))*(ratio)); + +} + +/* Radix-2 butterfly, in place: temp is the complex product values[1]*w, then values[1] = values[0]-temp and values[0] = values[0]+temp. w is passed by value and left untouched. */void radix2(inout vec2 values[2], vec2 w) { + vec2 temp; + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; +} + +/* Radix-4 butterfly on values[0..3], in place, done as two layers of radix-2 style butterflies followed by a swap of values[1] and values[2]. w is inout and is overwritten inside, so the caller's twiddle is not preserved. */void radix4(inout vec2 values[4],inout vec2 w) { + + //DIF 1st stage with double angle + vec2 temp; + temp.x=values[2].x*w.x-values[2].y*w.y; + temp.y=values[2].y*w.x+values[2].x*w.y; + values[2]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + values[3]=values[1]-temp; + values[1]=values[1]+temp; + + //DIF 2nd stage with half angle + /* For a unit-length w with angle strictly between -pi and pi, normalize(w + (1,0)) is the unit vector at half that angle. */w = normalize(w + vec2(1.0, 0.0)); + + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; + + /* Quarter-turn of w: (w.y, -w.x) when inverse, (-w.y, w.x) otherwise. */w = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + values[3]=values[2]-temp; + values[2]=values[2]+temp; + + /* Swap values[1] and values[2]. */temp = values[1]; + values[1]=values[2]; + values[2]=temp; +} + +/* Radix-8 butterfly on values[0..7], in place, done as three layers of radix-2 style butterflies followed by swaps of values[1] with values[4] and values[3] with values[6]. w is inout and is overwritten inside, so the caller's twiddle is not preserved. */void radix8(inout vec2 values[8], inout vec2 w) { + //DIF 1st stage with quadruple angle + + vec2 temp; + temp.x=values[4].x*w.x-values[4].y*w.y; + temp.y=values[4].y*w.x+values[4].x*w.y; + values[4]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[5].x*w.x-values[5].y*w.y; + temp.y=values[5].y*w.x+values[5].x*w.y; + values[5]=values[1]-temp; + values[1]=values[1]+temp; + + temp.x=values[6].x*w.x-values[6].y*w.y; + temp.y=values[6].y*w.x+values[6].x*w.y; + values[6]=values[2]-temp; + values[2]=values[2]+temp; + + temp.x=values[7].x*w.x-values[7].y*w.y; + temp.y=values[7].y*w.x+values[7].x*w.y; + values[7]=values[3]-temp; + values[3]=values[3]+temp; + + //DIF 2nd stage with double angle + /* Same half-angle step as in radix4; iw below is the quarter-turn of the new w, with the turn direction chosen by inverse. */w = normalize(w + vec2(1.0, 0.0)); + + temp.x=values[2].x*w.x-values[2].y*w.y; + temp.y=values[2].y*w.x+values[2].x*w.y; + values[2]=values[0]-temp; + values[0]=values[0]+temp; + + temp.x=values[3].x*w.x-values[3].y*w.y; + temp.y=values[3].y*w.x+values[3].x*w.y; + values[3]=values[1]-temp; + values[1]=values[1]+temp; + + vec2 iw = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); + + temp.x=values[6].x*iw.x-values[6].y*iw.y; + temp.y=values[6].y*iw.x+values[6].x*iw.y; + values[6]=values[4]-temp; + values[4]=values[4]+temp; + + temp.x=values[7].x*iw.x-values[7].y*iw.y; + temp.y=values[7].y*iw.x+values[7].x*iw.y; + values[7]=values[5]-temp; + values[5]=values[5]+temp; + + //DIF 3rd stage with angle + w = normalize(w + vec2(1.0, 0.0)); + + temp.x=values[1].x*w.x-values[1].y*w.y; + temp.y=values[1].y*w.x+values[1].x*w.y; + values[1]=values[0]-temp; + values[0]=values[0]+temp; + + iw = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); + + temp.x=values[3].x*iw.x-values[3].y*iw.y; + temp.y=values[3].y*iw.x+values[3].x*iw.y; + values[3]=values[2]-temp; + values[2]=values[2]+temp; + + /* iw becomes w turned by an eighth of a turn: w times (1+i)*M_SQRT1_2 in the forward case, w times (1-i)*M_SQRT1_2 when inverse. */iw.x=(inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; + iw.y=(inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; + + temp.x=values[5].x*iw.x-values[5].y*iw.y; + temp.y=values[5].y*iw.x+values[5].x*iw.y; + values[5]=values[4]-temp; + values[4]=values[4]+temp; + + /* w is reused to hold the quarter-turn of iw for the last butterfly. */w = (inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); + + temp.x=values[7].x*w.x-values[7].y*w.y; + temp.y=values[7].y*w.x+values[7].x*w.y; + values[7]=values[6]-temp; + values[6]=values[6]+temp; + + /* Swap values[1] with values[4]. */temp = values[1]; + values[1]=values[4]; + values[4]=temp; + + /* Swap values[3] with values[6]. */temp = values[3]; + values[3]=values[6]; + values[6]=temp; + +} + +/* Element count of the workgroup-shared scratch array sdata declared below. */const uint max_shared_vec2=4096; +const uint ratio_over_max = 4;// reg mem/shared mem +const uint tempSize = fft_dim/gl_WorkGroupSize.x; +shared vec2 sdata[max_shared_vec2];// half real half imag + + +void main() { + + vec2 temp0[8]; + vec2 temp1[8]; + vec2 temp2[8]; + vec2 temp3[8]; + uint stageSize=1; + float stageAngle=(inverse) ? 
-M_PI : M_PI; + if (zeropad_0){ + for(uint j = 0; j < 8; ++j){ + temp0[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)]; + temp0[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)+inputStride_1]; + temp1[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)]; + temp1[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)+inputStride_1]; + } + for(uint j = 0; j < 8; ++j){ + temp2[j]=vec2(0,0); + temp3[j]=vec2(0,0); + } + }else { + for(uint j = 0; j < 8; ++j){ + temp0[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)]; + temp0[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)+inputStride_1]; + temp1[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)]; + temp1[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)+inputStride_1]; + temp2[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j+16)*gl_WorkGroupSize.x)]; + temp2[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j+16)*gl_WorkGroupSize.x)+inputStride_1]; + temp3[j].x=inputs[indexInput(gl_LocalInvocationID.x+(j+24)*gl_WorkGroupSize.x)]; + temp3[j].y=inputs[indexInput(gl_LocalInvocationID.x+(j+24)*gl_WorkGroupSize.x)+inputStride_1]; + } + } + + for (uint n=0; n < numStages-1; n++){//all stages but last are radix-8 + { + vec2 sort0[8]; + vec2 sort1[8]; + for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + 
for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + { + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp0, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp1, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp2, twiddleFactor); + } + { + uint stageInvocationID = (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + radix8(temp3, twiddleFactor); + } + memoryBarrierShared(); + barrier(); + + //all stages but last have no shifts larger than shared memory size - no need for swap buffer. Need to serialize thread groups in ratio_over_max amount of batches and exchange data + { + float stageNormalization = (inverse) ? 
0.125 : 1.0; + uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); + uint blockInvocationID = (gl_LocalInvocationID.x) - stageInvocationID; + uint outputIndex = stageInvocationID + blockInvocationID * 8; + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp0[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp0[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp1[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp1[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp2[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp2[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + for(uint j = 0; j < 8; ++j){ + sdata[outputIndex+stageSize*j]=temp3[j]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + for (uint j=0; j<8; j++){ + temp3[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; + } + + memoryBarrierShared(); + barrier(); + + } + + stageSize=stageSize*8; + stageAngle=stageAngle*0.125f; + } + + + //last stage - arbitrary radix + //stageSize=4096; + { + vec2 sort0[8]; + vec2 sort1[8]; + for (uint t=0; t<2; t++){ + sort0[t*4]=temp0[t+4]; + sort0[t*4+1]=temp1[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4]=temp0[t+6]; + sort1[t*4+1]=temp1[t+6]; + } + for (uint t=0; t<2; t++) + sort0[t*4+2]=temp0[t+2]; + + for (uint t=0; t<2; t++) + temp0[t*4]=temp0[t]; + for (uint t=0; t<2; t++){ + temp0[t*4+1]=temp1[t]; + temp0[t*4+2]=temp2[t]; + temp0[t*4+3]=temp3[t]; + } + for (uint t=0; t<2; t++) + temp1[t*4+1]=temp1[t+2]; + for (uint t=0; t<2; t++){ + temp1[t*4]=sort0[t*4+2]; + 
temp1[t*4+2]=temp2[t+2]; + temp1[t*4+3]=temp3[t+2]; + } + + for (uint t=0; t<2; t++){ + sort0[t*4+2]=temp2[t+4]; + sort0[t*4+3]=temp3[t+4]; + } + for (uint t=0; t<2; t++){ + sort1[t*4+2]=temp2[t+6]; + sort1[t*4+3]=temp3[t+6]; + } + + for (uint t=0; t<8; t++){ + temp2[t]=sort0[t]; + temp3[t]=sort1[t]; + } + + } + + + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + i*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp0[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp0[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+2)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp1[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp1[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp2[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp2[i*4+j]=values[j]; + } + + } + for (uint i=0; i<2; i++){ + uint stageInvocationID = (gl_LocalInvocationID.x + (i+6)*gl_WorkGroupSize.x ) & (stageSize - 1u); + float angle = stageInvocationID * stageAngle; + vec2 twiddleFactor = vec2(cos(angle), sin(angle)); + vec2 values[4]; + for(uint j = 0; j < 4; ++j){ + values[j] = temp3[i*4+j]; + } + radix4(values, twiddleFactor); + for(uint j = 0; j < 4; ++j){ + temp3[i*4+j]=values[j]; + } + + } + { + vec2 sort0[8]; + vec2 sort1[8]; + 
for(uint j = 0; j < 2; ++j){ + sort0[j]=temp0[2+4*j]; + sort0[j+2]=temp1[2+4*j]; + + } + for(uint j = 0; j < 2; ++j){ + sort1[j]=temp0[3+4*j]; + sort1[j+2]=temp1[3+4*j]; + + } + for(uint j = 0; j < 2; ++j) + sort0[j+6]=temp0[4*j+1]; + + for(uint j = 0; j < 2; ++j) + temp0[j]=temp0[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+2]=temp1[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+4]=temp2[4*j]; + for(uint j = 0; j < 2; ++j) + temp0[j+6]=temp3[4*j]; + + for(uint j = 0; j < 2; ++j) + temp1[j+2]=temp1[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j]=sort0[j+6]; + for(uint j = 0; j < 2; ++j) + temp1[j+4]=temp2[4*j+1]; + for(uint j = 0; j < 2; ++j) + temp1[j+6]=temp3[4*j+1]; + + for(uint j = 0; j < 2; ++j){ + sort0[j+4]=temp2[2+4*j]; + sort0[j+6]=temp3[2+4*j]; + } + for(uint j = 0; j < 2; ++j){ + sort1[j+4]=temp2[3+4*j]; + sort1[j+6]=temp3[3+4*j]; + } + + for (uint j=0; j<8; j++){ + temp2[j]=sort0[j]; + temp3[j]=sort1[j];} + + } + + + //r2c decomp + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + + float stageNormalization = (inverse) ? 
0.25 : 1.0; + sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; + sdata[gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x]=temp3[i+4]*stageNormalization; + } + + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0) { + for (uint i=0; i<4; i++){ + temp0[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp0[i].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i+4].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + } else { + vec2 temp[2]; + temp[0].x=sdata[0].x; + temp[0].y=0; + temp[1].x=sdata[0].y; + temp[1].y=0; + if (zeropad_1){ + outputs[indexOutput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]=temp[0]; + outputs[indexOutput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*2*gl_NumWorkGroups.y)]=temp[1]; + } else { + outputs[indexOutput(2*gl_GlobalInvocationID.y, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]=temp[0]; + outputs[indexOutput(2*gl_GlobalInvocationID.y+1, gl_WorkGroupSize.y*gl_NumWorkGroups.y)]=temp[1]; + } + for (uint i=1; i<4; i++){ + temp0[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp0[i].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + 
temp3[i+4].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i+4]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x==0) + { + temp0[4].x=0.5*(sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + temp0[4].y=0.5*(sdata[0].y-sdata[4*gl_WorkGroupSize.x].y); + temp3[4].x=0.5*(sdata[0].y+sdata[4*gl_WorkGroupSize.x].y); + temp3[4].y=0.5*(-sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + sdata[gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x]=temp3[i]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0) { + for (uint i=0; i<4; i++){ + temp0[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp0[i+4].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + }else { + for (uint i=1; i<4; i++){ + temp0[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp0[i+4].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + 
temp3[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp3[i].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp1[i]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x==0) + { + temp1[0].x=0.5*(sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + temp1[0].y=0.5*(sdata[0].y-sdata[4*gl_WorkGroupSize.x].y); + temp3[0].x=0.5*(sdata[0].y+sdata[4*gl_WorkGroupSize.x].y); + temp3[0].y=0.5*(-sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + sdata[gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x]=temp2[i+4]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0) { + for (uint i=0; i<4; i++){ + temp1[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp1[i].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i+4].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + }else { + for (uint i=1; i<4; i++){ + temp1[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + 
temp1[i].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i+4].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 0.25 : 1.0; + sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp1[i+4]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x==0) + { + temp1[4].x=0.5*(sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + temp1[4].y=0.5*(sdata[0].y-sdata[4*gl_WorkGroupSize.x].y); + temp2[4].x=0.5*(sdata[0].y+sdata[4*gl_WorkGroupSize.x].y); + temp2[4].y=0.5*(-sdata[0].x+sdata[4*gl_WorkGroupSize.x].x); + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<4; i++){ + float stageNormalization = (inverse) ? 
0.25 : 1.0; + sdata[gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x]=temp2[i]*stageNormalization; + } + memoryBarrierShared(); + barrier(); + + if (gl_LocalInvocationID.x>0) { + for (uint i=0; i<4; i++){ + temp1[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp1[i+4].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + }else { + for (uint i=1; i<4; i++){ + temp1[i+4].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + temp1[i+4].y=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y-sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i].x=0.5*(sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y); + temp2[i].y=0.5*(-sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x+sdata[8*gl_WorkGroupSize.x-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x); + } + temp0[0].x=(sdata[4*gl_WorkGroupSize.x].x); + temp0[0].y=0; + temp2[0].x=(sdata[4*gl_WorkGroupSize.x].y); + temp2[0].y=0; + } + //r2c save + if (ratioDirection_1){ + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + outputs[indexOutput((gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1), 
gl_GlobalInvocationID.y)]=temp1[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+12)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp1[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+20)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+28)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[i]; + } + } else{ + for (uint i=1; i<4; i++){ + outputs[indexOutput((gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp1[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+12)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp1[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+20)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[i]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+24)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[i+4]; + outputs[indexOutput((gl_LocalInvocationID.x+(i+28)*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[i]; + } + outputs[indexOutput((4*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[4]; + outputs[indexOutput((8*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp1[0]; + outputs[indexOutput((12*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp1[4]; + outputs[indexOutput((16*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp0[0]; + outputs[indexOutput((20*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[4]; + 
outputs[indexOutput((24*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp3[0]; + outputs[indexOutput((28*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[4]; + outputs[indexOutput((32*gl_WorkGroupSize.x-1), gl_GlobalInvocationID.y)]=temp2[0]; + + } + }else{ + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i]; + sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp3[i+4]; + } + } else{ + for (uint i=1; i<4; i++){ + sdata[positionShuffle(i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i]; + sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp3[i+4]; + } + sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp0[4]; + sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp3[4]; + } + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]=sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; + } + + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i+4]; + sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp3[i]; + } + } else{ + for (uint i=1; i<4; i++){ + sdata[positionShuffle((i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i+4]; + sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp3[i]; + } + sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp1[0]; + sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp3[0]; + } + + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + 
outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+8*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]=sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i]; + sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp2[i+4]; + } + } else{ + for (uint i=1; i<4; i++){ + sdata[positionShuffle((i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i]; + sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp2[i+4]; + } + sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp1[4]; + sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp2[4]; + } + + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+16*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]=sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; + } + memoryBarrierShared(); + barrier(); + if (gl_LocalInvocationID.x>0){ + for (uint i=0; i<4; i++){ + sdata[positionShuffle(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i+4]; + sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp2[i]; + } + } else{ + for (uint i=1; i<4; i++){ + sdata[positionShuffle((i)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i+4]; + sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp2[i]; + } + sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp0[0]; + sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp2[0]; + } + + memoryBarrierShared(); + barrier(); + for (uint i=0; i<8; i++){ + 
outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+24*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]=sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; + } + } +} diff --git a/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.spv b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.spv new file mode 100644 index 000000000..074fd52d3 Binary files /dev/null and b/core/thirdparty/VkFFT/shaders/16384/vkFFT_single_r2c_16384.spv differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.comp index 523e504cb..a9c6ff195 100644 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.comp +++ b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.comp @@ -3,10 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -20,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 
8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -32,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -199,21 +201,37 @@ void main() { uint stageSize=1; float stageAngle=(inverse) ? -M_PI : M_PI; if (zeropad_0&&(!inverse)){ - for(uint j = 0; j < 4; ++j){ - temp0[j]=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)]; - temp1[j]=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)]; + for(uint i = 0; i < 4; ++i){ + temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; } - for(uint j = 4; j < 8; ++j){ - temp0[j]=vec2(0,0); - temp1[j]=vec2(0,0); + for(uint i = 4; i < 8; ++i){ + temp0[i]=vec2(0,0); + temp1[i]=vec2(0,0); } }else { - for(uint j = 0; j < 8; ++j){ - temp0[j]=inputs[indexInput(gl_LocalInvocationID.x+(j)*gl_WorkGroupSize.x)]; - temp1[j]=inputs[indexInput(gl_LocalInvocationID.x+(j+8)*gl_WorkGroupSize.x)]; + for(uint i = 0; i < 8; ++i){ + temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; + temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]; } } + if ((passID>0)&&(!inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp0[i].x*mult.x-temp0[i].y*mult.y, temp0[i].y*mult.x+temp0[i].x*mult.y); + temp0[i]=res; 
+ } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(temp1[i].x*mult.x-temp1[i].y*mult.y, temp1[i].y*mult.x+temp1[i].x*mult.y); + temp1[i]=res; + } + memoryBarrierShared(); + barrier(); + } for (uint n=0; n < numStages-1; n++){//all stages but last are radix-8 { vec2 sort0[8]; @@ -367,18 +385,27 @@ void main() { memoryBarrierShared(); barrier(); - - if (zeropad_0&&(inverse)){ - float stageNormalization = (inverse) ? 0.5 : 1.0; - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; + if ((passID>0)&&(inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp0[i].x*mult.x-temp0[i].y*mult.y, temp0[i].y*mult.x+temp0[i].x*mult.y); + temp0[i]=res; + } + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x)*(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x))/float(fft_dim_full)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(temp1[i].x*mult.x-temp1[i].y*mult.y, temp1[i].y*mult.x+temp1[i].x*mult.y); + temp1[i]=res; } - memoryBarrierShared(); barrier(); + } + if (zeropad_0&&(inverse)){ + float stageNormalization = (inverse) ? 0.5 : 1.0; for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp0[i]*stageNormalization; } memoryBarrierShared(); barrier(); @@ -388,27 +415,13 @@ void main() { float stageNormalization = (inverse) ? 
0.5 : 1.0; for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp0[i]*stageNormalization; } memoryBarrierShared(); barrier(); - - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp1[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x+(gl_WorkGroupID.x)*fft_dim)]=temp1[i]*stageNormalization; } memoryBarrierShared(); barrier(); diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.spv index 1b8fb4149..333ed109b 100644 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.spv and b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_8192.spv differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C.comp deleted file mode 100644 index b727f08ac..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C.comp +++ /dev/null @@ -1,459 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout(push_constant) uniform PushConsts -{ - bool inverse; - bool zeropad[2]; - uint inputStride[5]; - uint outputStride[5]; - uint radixStride[3]; - uint 
numStages; - uint stageRadix[2]; - uint ratio[2]; - bool ratioDirection[2]; - uint inputOffset; - uint outputOffset; - uint coordinate; - uint batchID; -} consts; - - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; -uint indexInput(uint index) { - return consts.inputOffset+index * consts.inputStride[0] + gl_GlobalInvocationID.y * consts.inputStride[1] + gl_GlobalInvocationID.z * consts.inputStride[2] + consts.coordinate * consts.inputStride[3] + consts.batchID * consts.inputStride[4]; -} -uint indexOutput(uint index) { - return consts.outputOffset+index * consts.outputStride[0] + gl_GlobalInvocationID.y * consts.outputStride[1] + gl_GlobalInvocationID.z * consts.outputStride[2] + consts.coordinate * consts.outputStride[3] + consts.batchID * consts.outputStride[4]; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); -} -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (consts.inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(consts.inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(consts.inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (consts.inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -shared vec2 sdata[gl_WorkGroupSize.y*fft_dim]; - -void main() { - - vec2 temp0[8]; - vec2 temp1[8]; - uint stageSize=1; - float stageAngle=(consts.inverse) ? -M_PI : M_PI; - if (consts.zeropad[0]){ - if (consts.ratioDirection[0]){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0]); - if (pos%fft_dim0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - 
sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (consts.ratioDirection[0]){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.ratio[0], 
consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, consts.coordinate)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.coordinate)]; - 
sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, consts.coordinate)]; - } - } else{ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, consts.coordinate)]; 
- } - - } - memoryBarrierShared(); - barrier(); - - for (uint n=0; n < consts.numStages-1; n++){//all stages but last are radix-8 - { - vec2 sort0[8]; - for(uint j = 0; j < 4; ++j){ - sort0[j]=temp0[1+2*j]; - sort0[j+4]=temp1[1+2*j];} - - for(uint j = 0; j < 4; ++j) - temp0[j]=temp0[2*j]; - - for(uint j = 0; j < 4; ++j) - temp0[j+4]=temp1[2*j]; - - for (uint j=0; j<8; j++) - temp1[j]=sort0[j]; - } - { - uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); - float angle = stageInvocationID * stageAngle; - vec2 twiddleFactor = vec2(cos(angle), sin(angle)); - vec2 values[8]; - for(uint j = 0; j < 8; ++j){ - values[j] =temp0[j]; - } - radix8(values, twiddleFactor); - for(uint j = 0; j < 8; ++j){ - temp0[j]=values[j]; - - } - - } - { - - uint stageInvocationID = (gl_LocalInvocationID.+gl_WorkGroupSize.x) & (stageSize - 1u); - float angle = stageInvocationID * stageAngle; - vec2 twiddleFactor = vec2(cos(angle), sin(angle)); - vec2 values[8]; - for(uint j = 0; j < 8; ++j){ - values[j] =temp1[j]; - } - radix8(values, twiddleFactor); - for(uint j = 0; j < 8; ++j){ - temp1[j]=values[j]; - - } - - } - memoryBarrierShared(); - barrier(); - - //all stages but last have no shifts larger than shared memory size - no need for swap buffer. Need to serialize thread groups in ratio amount of batches and exchange data - { - float stageNormalization = (consts.inverse) ? 
0.125 : 1.0; - uint stageInvocationID = (gl_LocalInvocationID.x) & (stageSize - 1u); - uint blockInvocationID = (gl_LocalInvocationID.x) - stageInvocationID; - uint outputIndex = stageInvocationID + blockInvocationID * 8; - for(uint j = 0; j < 8; ++j){ - sdata[outputIndex+stageSize*j]=temp0[j]*stageNormalization; - } - memoryBarrierShared(); - barrier(); - - for (uint j=0; j<8; j++){ - temp0[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; - } - - memoryBarrierShared(); - barrier(); - - for(uint j = 0; j < 8; ++j){ - sdata[outputIndex+stageSize*j]=temp1[j]*stageNormalization; - } - memoryBarrierShared(); - barrier(); - - for (uint j=0; j<8; j++){ - temp1[j] = sdata[(gl_LocalInvocationID.x)+gl_WorkGroupSize.x*j]; - } - - memoryBarrierShared(); - barrier(); - - } - - stageSize=stageSize*8; - stageAngle=stageAngle/8.0f; - } - - - //last stage - arbitrary radix - //stageSize=4096; - { - vec2 sort0[8]; - for (uint t=0; t<4; t++){ - sort0[t*2]=temp0[t]; - sort0[t*2+1]=temp1[t]; - } - for (uint t=0; t<4; t++){ - temp0[t]=sort0[t]; - temp1[t]=sort0[t+4]; - } - for (uint t=0; t<4; t++){ - sort0[t*2]=temp0[t+4]; - sort0[t*2+1]=temp1[t+4]; - } - for (uint t=0; t<4; t++) - temp0[t+4]=temp1[t]; - - for (uint t=0; t<8; t++) - temp1[t]=sort0[t]; - - - } - - - for (uint i=0; i<4; i++){ - uint stageInvocationID = (gl_LocalInvocationID.x + i*gl_WorkGroupSize.x ) & (stageSize - 1u); - float angle = stageInvocationID * stageAngle; - vec2 twiddleFactor = vec2(cos(angle), sin(angle)); - - vec2 values[2]; - for(uint j = 0; j < 2; ++j){ - values[j] = temp0[i*2+j]; - } - radix2(values, twiddleFactor); - for(uint j = 0; j < 2; ++j){ - temp0[i*2+j]=values[j]; - } - - } - for (uint i=0; i<4; i++){ - uint stageInvocationID = (gl_LocalInvocationID.x + (i+4)*gl_WorkGroupSize.x ) & (stageSize - 1u); - float angle = stageInvocationID * stageAngle; - vec2 twiddleFactor = vec2(cos(angle), sin(angle)); - vec2 values[2]; - for(uint j = 0; j < 2; ++j){ - values[j] = temp1[i*2+j]; - } - 
radix2(values, twiddleFactor); - for(uint j = 0; j < 2; ++j){ - temp1[i*2+j]=values[j]; - } - - } - { - vec2 sort0[8]; - for (uint t=0; t<4; t++){ - sort0[t]=temp0[t*2+1]; - sort0[t+4]=temp1[t*2+1]; - } - for (uint t=0; t<4; t++) - temp0[t]=temp0[t*2]; - - for (uint t=0; t<4; t++) - temp0[t+4]=temp1[t*2]; - - for (uint t=0; t<8; t++) - temp1[t]=sort0[t]; - - } - memoryBarrierShared(); - barrier(); - - if (consts.zeropad[0]&&(consts.inverse)){ - float stageNormalization = (consts.inverse) ? 0.5 : 1.0; - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; - } - memoryBarrierShared(); - barrier(); - } - else{ - - float stageNormalization = (consts.inverse) ? 0.5 : 1.0; - - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp1[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; - } - memoryBarrierShared(); - barrier(); - } - -} diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.comp deleted file mode 100644 index 0de250d38..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.comp +++ /dev/null 
@@ -1,824 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout (constant_id = 5) const bool inverse = false; -layout (constant_id = 6) const bool zeropad_0 = false; -layout (constant_id = 7) const bool zeropad_1 = false; -layout (constant_id = 8) const uint inputStride_0 = 1; -layout (constant_id = 9) const uint inputStride_1 = 1; -layout (constant_id = 10) const uint inputStride_2 = 1; -layout (constant_id = 11) const uint inputStride_3 = 1; -layout (constant_id = 12) const uint inputStride_4 = 1; -layout (constant_id = 13) const uint outputStride_0 = 1; -layout (constant_id = 14) const uint outputStride_1 = 1; -layout (constant_id = 15) const uint outputStride_2 = 1; -layout (constant_id = 16) const uint outputStride_3 = 1; -layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; -layout (constant_id = 21) const uint numStages = 1; -layout (constant_id = 22) const uint stageRadix_0 = 8; -layout (constant_id = 23) const uint stageRadix_1 = 8; -layout (constant_id = 24) const uint ratio_0 = 8; -layout (constant_id = 25) const uint ratio_1 = 8; -layout (constant_id = 26) const bool ratioDirection_0 = false; -layout (constant_id = 27) const bool ratioDirection_1 = true; -layout (constant_id = 28) const uint inputOffset = 0; -layout (constant_id = 29) const uint outputOffset = 0; - -layout(push_constant) uniform PushConsts -{ - uint coordinate; - uint batchID; -} consts; - - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; -uint indexInput(uint index_x, uint index_y) { - return 
inputOffset+index_x * inputStride_0 + index_y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + consts.batchID * inputStride_4; -} -uint indexOutput(uint index_x, uint index_y) { - return outputOffset+index_x * outputStride_0 + index_y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3 + consts.batchID * outputStride_4; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); -} -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -const uint max_shared_vec2=4096; -const uint last_ratio = 2;// reg mem/shared mem -const uint tempSize = fft_dim/gl_WorkGroupSize.x; -shared vec2 sdata[max_shared_vec2];// half real half imag - -void main() { - vec2 temp0[8]; - vec2 temp1[8]; - - if (zeropad_0){ - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - if (gl_LocalInvocationID.y>0) - return; - for (uint i=0; i<4; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 
2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - for (uint i=4; i<8; i++){ - temp0[i]=vec2(0,0); - temp1[i]=vec2(0,0); - - } - } - } else{ - for (uint i=0; i<4; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - for (uint i=4; i<8; i++){ - temp0[i]=vec2(0,0); - temp1[i]=vec2(0,0); - - } - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - sort0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - - } - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp0[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 
ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - if (gl_LocalInvocationID.y>0) - return; - for (uint i=0; i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - } else{ - for (uint i=0; i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]=temp0[i]; - } - - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=(inverse) ? 
-M_PI : M_PI; - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - } - -} diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.spv deleted file mode 100644 index 5c4deee32..000000000 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.spv and /dev/null differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R.comp deleted file mode 100644 index bb5b910ef..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R.comp +++ /dev/null @@ -1,454 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout(push_constant) uniform PushConsts -{ - bool inverse; - bool zeropad[2]; - uint inputStride[4]; - uint outputStride[4]; - uint radixStride[3]; - uint numStages; - uint stageRadix[2]; - uint ratio[2]; - bool ratioDirection[2]; - uint inputOffset; - uint outputOffset; - uint coordinate; -} consts; - - -layout(std430, binding = 0) buffer Data { - vec2 
inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; -uint indexInput(uint index, uint coordinate) { - return consts.inputOffset+index * consts.inputStride[0] + gl_GlobalInvocationID.y * consts.inputStride[1] + gl_GlobalInvocationID.z * consts.inputStride[2] + coordinate * consts.inputStride[3]; -} -uint indexOutput(uint index, uint coordinate) { - return consts.outputOffset+index * consts.outputStride[0] + gl_GlobalInvocationID.y * consts.outputStride[1] + gl_GlobalInvocationID.z * consts.outputStride[2] + coordinate * consts.outputStride[3]; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); -} - -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (consts.inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(consts.inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(consts.inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (consts.inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} -shared vec2 sdata[gl_WorkGroupSize.y*fft_dim]; - -void main() { - - if ((gl_WorkGroupID.y == gl_NumWorkGroups.y-1)&& (gl_LocalInvocationID.y>0)) - return; - - if (consts.ratioDirection[0]){ - for (uint i=0; i<8; i++) - sdata[positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0])]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.coordinate)]; - - }else{ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, 
consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, consts.coordinate)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, consts.coordinate)]; - } - - - memoryBarrierShared(); - barrier(); - - vec2 temp[8]; - uint stageSize=1; - float stageAngle=(consts.inverse) ? -M_PI : M_PI; - //0-numStages stage - - for (uint n=0; n < consts.numStages; n++){ - uint current_radix = (n0)) - return; - - - vec2 temp0[8]; - vec2 temp1[8]; - - - if (ratioDirection_0){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - sort0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - - } - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - 
temp0[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]=temp0[i]; - } - - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=(inverse) ? 
-M_PI : M_PI; - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<4; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; i<4; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=temp0[i]; - for (uint i=0; i<4; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]=temp1[i]; - } - - } - } else { - if (ratioDirection_1){ - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=temp0[i]; - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]=temp1[i]; - } - else{ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - 
memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=temp0[i]; - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]=temp1[i]; - } - - } - } -} diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R_for_transposition_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R_for_transposition_8192.spv deleted file mode 100644 index d4e13a3f1..000000000 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_beforeC2R_for_transposition_8192.spv and /dev/null differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.comp 
b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.comp deleted file mode 100644 index 716fb47d4..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.comp +++ /dev/null @@ -1,849 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout (constant_id = 5) const bool inverse = false; -layout (constant_id = 6) const bool zeropad_0 = false; -layout (constant_id = 7) const bool zeropad_1 = false; -layout (constant_id = 8) const uint inputStride_0 = 1; -layout (constant_id = 9) const uint inputStride_1 = 1; -layout (constant_id = 10) const uint inputStride_2 = 1; -layout (constant_id = 11) const uint inputStride_3 = 1; -layout (constant_id = 12) const uint inputStride_4 = 1; -layout (constant_id = 13) const uint outputStride_0 = 1; -layout (constant_id = 14) const uint outputStride_1 = 1; -layout (constant_id = 15) const uint outputStride_2 = 1; -layout (constant_id = 16) const uint outputStride_3 = 1; -layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; -layout (constant_id = 21) const uint numStages = 1; -layout (constant_id = 22) const uint stageRadix_0 = 8; -layout (constant_id = 23) const uint stageRadix_1 = 8; -layout (constant_id = 24) const uint ratio_0 = 8; -layout (constant_id = 25) const uint ratio_1 = 8; -layout (constant_id = 26) const bool ratioDirection_0 = false; -layout (constant_id = 27) const bool ratioDirection_1 = true; -layout (constant_id = 28) const uint inputOffset = 0; -layout (constant_id = 29) const uint outputOffset = 0; - -layout(push_constant) uniform PushConsts -{ - uint coordinate; 
- uint batchID; -} consts; - - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; -uint indexInput(uint index_x, uint index_y) { - return inputOffset+index_x * inputStride_0 + index_y * inputStride_1 + 2 * gl_GlobalInvocationID.z * inputStride_2 + 2 * consts.coordinate * inputStride_3+ 2*consts.batchID * inputStride_4; -} -uint indexOutput(uint index_x, uint index_y) { - return outputOffset+index_x * outputStride_0 + index_y* outputStride_1 + 2 * gl_GlobalInvocationID.z * outputStride_2 + 2 * consts.coordinate * outputStride_3+ 2*consts.batchID * outputStride_4; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); - -} -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -const uint max_shared_vec2=4096; -const uint last_ratio = 2;// reg mem/shared mem -const uint tempSize = fft_dim/gl_WorkGroupSize.x; -shared vec2 sdata[max_shared_vec2];// half real half imag - -void main() { - - vec2 temp0[8]; - vec2 temp1[8]; - if ((zeropad_0)&&(!inverse)){ - if (ratioDirection_0){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; i<4; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - 
temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - for (uint i=0; i<4; i++){ - temp0[i+4]=vec2(0,0); - temp1[i+4]=vec2(0,0); - - } - } - } else { - if (ratioDirection_0){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - sort0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - - } - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp0[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; 
i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]=temp0[i]; - } - - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=(inverse) ? -M_PI : M_PI; - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<4; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - } - } else { - if (ratioDirection_1){ - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=temp0[i]; - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]=temp1[i]; - - }else{ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp0[i]; - - 
pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - } - } - - - -} diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.spv deleted file mode 100644 index 1b84df799..000000000 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2c_for_transposition_8192.spv and /dev/null differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.comp index 34fe1a701..85b0a25d9 100644 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.comp +++ b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.comp 
@@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -55,7 +58,10 @@ uint indexOutput(uint index) { uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/4)); + if (ratioDirection) + return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/2)); + 
else + return (((pos)/(fft_dim/2))+((pos)%(fft_dim/2))*(ratio)); /*if (ratioDirection) return ((pos >> ratio)+(pos & (1<>1)); @@ -206,8 +212,19 @@ void main() { //c2r regroup if (ratioDirection_0){ + vec2 sort0[8]; + vec2 sort1[8]; for (uint i=0; i<8; i++){ - sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sort0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + sort1[i]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + } + for (uint i=0; i<8; i++){ + uint pos= positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); + if (pos < fft_dim/4) + sdata[pos]=sort0[i]; + pos= positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); + if ((pos >= fft_dim/2)&&(pos < 3*fft_dim/4)) + sdata[pos-fft_dim/4]=sort1[i]; } memoryBarrierShared(); @@ -248,7 +265,13 @@ void main() { memoryBarrierShared(); barrier(); for (uint i=0; i<8; i++){ - sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]; + uint pos= positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); + if ((pos >= fft_dim/4) && (pos < fft_dim/2)) + sdata[pos-fft_dim/4]=sort0[i]; + pos= positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); + if (pos >= 3*fft_dim/4) + sdata[pos-fft_dim/2]=sort1[i]; + } memoryBarrierShared(); @@ -503,16 +526,10 @@ void main() { if (zeropad_0){ float stageNormalization = (inverse) ? 
0.5 : 1.0; - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x; - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=temp0[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=temp0[i].y*stageNormalization; } memoryBarrierShared(); barrier(); @@ -521,30 +538,17 @@ void main() { float stageNormalization = (inverse) ? 0.5 : 1.0; - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp0[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x; - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=temp0[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+ outputStride_1]=temp0[i].y*stageNormalization; } memoryBarrierShared(); barrier(); - - for (uint i=0; i<8; i++){ - sdata[gl_LocalInvocationID.x + i*gl_WorkGroupSize.x]=temp1[i]*stageNormalization; - } - - memoryBarrierShared(); - barrier(); for (uint i=0; i<8; i++){ - outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x; - outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)+ outputStride_1]=sdata[(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y; + 
outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)]=temp1[i].x*stageNormalization; + outputs[indexOutput(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x)+ outputStride_1]=temp1[i].y*stageNormalization; } memoryBarrierShared(); barrier(); diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.spv index 9d4adfb62..e99f37617 100644 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.spv and b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_8192.spv differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_for_transposition_8192.comp b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_for_transposition_8192.comp deleted file mode 100644 index 8a54426eb..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_c2r_for_transposition_8192.comp +++ /dev/null @@ -1,692 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout(push_constant) uniform PushConsts -{ - bool inverse; - bool zeropad[2]; - uint inputStride[5]; - uint outputStride[5]; - uint radixStride[3]; - uint numStages; - uint stageRadix[2]; - uint ratio[2]; - bool ratioDirection[2]; - uint inputOffset; - uint outputOffset; - uint coordinate; - uint batchID; -} consts; - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - float outputs[]; -}; -uint indexInput(uint index_x, uint index_y) { - return consts.inputOffset+index_x * consts.inputStride[0] + index_y * consts.inputStride[1] + gl_GlobalInvocationID.z * consts.inputStride[2] + consts.coordinate * consts.inputStride[3]+ consts.batchID * consts.inputStride[4] ; -} -uint indexOutput(uint index_x, uint index_y) { - return 
consts.outputOffset+index_x * consts.outputStride[0] + 2*index_y * consts.outputStride[1] + 2*gl_GlobalInvocationID.z * consts.outputStride[2] + 2*consts.coordinate * consts.outputStride[3] + 2*consts.batchID * consts.outputStride[4]; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - if (ratioDirection) - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/2)); - else - return (((pos)/(fft_dim/2))+((pos)%(fft_dim/2))*(ratio)); -} - -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (consts.inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(consts.inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(consts.inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (consts.inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -const uint max_shared_vec2=4096; -const uint last_ratio = 2;// reg mem/shared mem -const uint tempSize = fft_dim/gl_WorkGroupSize.x; -shared vec2 sdata[max_shared_vec2];// half real half imag - -void main() { - - vec2 temp0[8]; - vec2 temp1[8]; - - //c2r regroup - if (consts.ratioDirection[0]){ - vec2 sort0[8]; - for (uint i=0; i<8; i++){ - sort0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - - } - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0]); - - if (pos < max_shared_vec2) - sdata[pos]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0]); - - if (pos < max_shared_vec2) - sdata[pos]=temp1[i]; - } - memoryBarrierShared(); - 
barrier(); - - for (uint i=0; i<8; i++){ - temp0[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0]); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=sort0[i]; - - pos=positionShuffle(max_shared_vec2+8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, consts.ratio[0], consts.ratioDirection[0]); - - if (pos >= max_shared_vec2) - sdata[pos-max_shared_vec2]=temp1[i]; - } - memoryBarrierShared(); - barrier(); - - for (uint i=0; i<8; i++){ - temp1[i]=sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; - } - }else{ - for (uint i=0; i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - - for (uint i=0; i<4; i++){ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + 1+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].x=(temp0[i].x-temp0[i+4].y); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + 1+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x].y=(temp0[i].y+temp0[i+4].x); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + fft_dim-1-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].x=(temp0[i].x+temp0[i+4].y); - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + fft_dim-1-(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)].y=(-temp0[i].y+temp0[i+4].x); - } - - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=(consts.inverse) ? 
-M_PI : M_PI; - - //0-numStages stage - for (uint n=0; n < consts.numStages; n++){ - uint current_radix = (n0){ for (uint i=0; i<4; i++){ - sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i]; - sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i+4]; + uint pos=positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[i]; + pos=positionShuffle(gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[i]; } } else{ for (uint i=1; i<4; i++){ - sdata[positionShuffle(i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i]; - sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i+4]; + uint pos=positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[i]; + pos=positionShuffle(gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[i]; } - sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp0[4]; - sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, 
ratioDirection_1)]=temp1[4]; + uint pos=positionShuffle(4*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[4]; + pos=positionShuffle(12*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[4]; + pos=positionShuffle(8*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp0[0]; + pos=positionShuffle(16*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos < max_shared_vec2) + sdata[pos]=temp1[0]; + } memoryBarrierShared(); barrier(); @@ -507,22 +538,54 @@ void main() { outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, gl_GlobalInvocationID.y)]=sdata[gl_LocalInvocationID.x+i*gl_WorkGroupSize.x]; } + memoryBarrierShared(); barrier(); + if (gl_LocalInvocationID.x>0){ for (uint i=0; i<4; i++){ - sdata[positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i+4]; - sdata[positionShuffle(gl_LocalInvocationID.x+(i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i]; + uint pos=positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[i]; + pos=positionShuffle(gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[i]; } } else{ for (uint i=1; i<4; i++){ - sdata[positionShuffle(i*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp0[i+4]; - sdata[positionShuffle((i+8)*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1)]=temp1[i]; + 
uint pos=positionShuffle(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[i]; + pos=positionShuffle(gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[i+4]; + pos=positionShuffle(max_shared_vec2+gl_LocalInvocationID.x+(i+4)*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[i]; } - sdata[positionShuffle((4*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp0[0]; - sdata[positionShuffle((12*gl_WorkGroupSize.x-1), ratio_1, ratioDirection_1)]=temp1[0]; + uint pos=positionShuffle(4*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[4]; + pos=positionShuffle(12*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[4]; + pos=positionShuffle(8*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp0[0]; + pos=positionShuffle(16*gl_WorkGroupSize.x-1, ratio_1, ratioDirection_1); + if (pos >= max_shared_vec2) + sdata[pos-max_shared_vec2]=temp1[0]; + } - memoryBarrierShared(); barrier(); for (uint i=0; i<8; i++){ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_8192.spv b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_8192.spv index 7bfcdf3c7..553323f7e 100644 Binary files a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_8192.spv and b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_8192.spv differ diff --git a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_for_transposition_8192.comp 
b/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_for_transposition_8192.comp deleted file mode 100644 index 9bd2b43fe..000000000 --- a/core/thirdparty/VkFFT/shaders/8192/vkFFT_single_r2c_for_transposition_8192.comp +++ /dev/null @@ -1,481 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout(push_constant) uniform PushConsts -{ - bool inverse; - bool zeropad[2]; - uint inputStride[5]; - uint outputStride[5]; - uint radixStride[3]; - uint numStages; - uint stageRadix[2]; - uint ratio[2]; - bool ratioDirection[2]; - uint inputOffset; - uint outputOffset; - uint coordinate; - uint batchID; -} consts; - - -layout(std430, binding = 0) buffer Data { - float inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; -uint indexInput(uint index) { - return consts.inputOffset+index * consts.inputStride[0] + 2*gl_GlobalInvocationID.y * consts.inputStride[1] + 2*gl_GlobalInvocationID.z * consts.inputStride[2] + 2*consts.coordinate * consts.inputStride[3] + 2*consts.batchID * consts.inputStride[4] ; -} -uint indexOutput(uint index_x, uint index_y) { - return consts.outputOffset+index_x * consts.outputStride[0] + index_y * consts.outputStride[1] + gl_GlobalInvocationID.z * consts.outputStride[2] + consts.coordinate * consts.outputStride[3]+ consts.batchID * consts.outputStride[4]; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - if (ratioDirection) - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim/2)); - else - return (((pos)/(fft_dim/2))+((pos)%(fft_dim/2))*(ratio)); -} - -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 
values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (consts.inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (consts.inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (consts.inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(consts.inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(consts.inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (consts.inverse) ? 
vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -const uint max_shared_vec2=4096; -const uint last_ratio = 2;// reg mem/shared mem -const uint tempSize = fft_dim/gl_WorkGroupSize.x; -shared vec2 sdata[max_shared_vec2];// half real half imag - -void main() { - vec2 temp0[8]; - vec2 temp1[8]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x].x=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + gl_LocalInvocationID.x].y=inputs[indexInput(gl_LocalInvocationID.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)+consts.inputStride[1]]; - if (consts.zeropad[0]){ - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)].x=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)].y=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)].x=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)].y=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)].x=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)].y=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)].x=0; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)].y=0; - } else { - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)+consts.inputStride[1]]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)].x=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - sdata[8*gl_WorkGroupSize.x*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)].y=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)+consts.inputStride[1]]; - } - if (consts.zeropad[0]){ - for (uint i=0; i<4; i++){ - temp0[i].x=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; - temp0[i].y=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)+consts.inputStride[1]]; - temp1[i].x=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x)]; - temp1[i].y=inputs[indexInput(gl_LocalInvocationID.x+(i+16)*gl_WorkGroupSize.x)+consts.inputStride[1]]; - } - } else{ - for (uint i=0; i<8; i++){ - temp0[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y)]; - temp1[i]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 2*gl_WorkGroupSize.y * gl_WorkGroupID.y + gl_LocalInvocationID.y + gl_WorkGroupSize.y)]; - } - } - memoryBarrierShared(); - barrier(); - - vec2 temp[8]; - uint stageSize=1; - float stageAngle=(consts.inverse) ? -M_PI : M_PI; - - //0-numStages stage - for (uint n=0; n < consts.numStages; n++){ - uint current_radix = (n0)&&(!inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + float stageAngle=(inverse) ? 
-M_PI : M_PI; + + //0-numStages stage for (uint n=0; n < numStages; n++){ uint current_radix = (n0)&&(inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } if ((zeropad_0)&&(inverse)){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), stageStartSize*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(stageStartSize*fft_dim))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; } else { - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, 
(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), stageStartSize*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(stageStartSize*fft_dim))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; } + } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_c2c.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_c2c.spv index c487c48b3..18162ba88 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_c2c.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_c2c.spv differ diff --git 
a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.comp b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.comp index 55f02cb55..e43611929 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.comp @@ -3,10 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; - layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -20,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -32,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -201,24 +203,28 
@@ void main() { vec2 temp0[8]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+gl_WorkGroupSize.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+2*gl_WorkGroupSize.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+3*gl_WorkGroupSize.y)]; if (zeropad_0){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=vec2(0,0); + for (uint i=0; i < 4; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim))]; + for (uint i=4; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=vec2(0,0); } else { - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+4*gl_WorkGroupSize.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, 
gl_LocalInvocationID.y+5*gl_WorkGroupSize.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+6*gl_WorkGroupSize.y)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+7*gl_WorkGroupSize.y)]; + for (uint i=0; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim))]; } memoryBarrierShared(); barrier(); - + if ((passID>0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -439,7 +445,8 @@ void main() { vec2 temp0_out[8]; //1x1 convolution for a 1d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInputKernel(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y+i*gl_WorkGroupSize.y, batchID); + uint icellkernel= indexInputKernel(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), batchID); + temp0_out[i].x= kernel[icellkernel ].x * temp0[i].x - kernel[icellkernel ].y * temp0[i].y; temp0_out[i].y= kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel ].y * temp0[i].x; @@ -664,17 +671,23 @@ void main() { memoryBarrierShared(); barrier(); } - - outputs[indexOutput(gl_GlobalInvocationID.x, 
(gl_LocalInvocationID.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - if (!zeropad_0){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + 
memoryBarrierShared(); + barrier(); + } + if (zeropad_0){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } else { + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), batchID)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; } memoryBarrierShared(); barrier(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.spv index c1c03c202..a95fc88b1 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_1x1.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.comp b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.comp index 6f843714f..4cd5294c2 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.comp @@ -5,7 +5,6 @@ const float M_SQRT1_2 = 0.70710678118654752440084436210485; layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) 
const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -200,24 +203,28 @@ void main() { for(uint coordinate=0; coordinate<2; coordinate++){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+2*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+3*gl_WorkGroupSize.y, coordinate)]; if (zeropad_0){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=vec2(0,0); - 
sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=vec2(0,0); + for (uint i=0; i < 4; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; + for (uint i=4; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=vec2(0,0); } else { - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+4*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+5*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+6*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+7*gl_WorkGroupSize.y, coordinate)]; + for (uint i=0; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; } memoryBarrierShared(); barrier(); - + if (passID>0){ + for (uint i=0; i < 8; i++){ + float 
angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -452,7 +459,7 @@ void main() { //2x2 nonsymmetric convolution for a 2d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y+i*gl_WorkGroupSize.y, 0); + uint icellkernel= indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(fft_dim), 0); float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; float temp_spin_real1 = kernel[icellkernel+2*inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x - kernel[icellkernel+2*inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y; @@ -697,16 +704,24 @@ void main() { barrier(); } - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - 
outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - if (!zeropad_0){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - } + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + if (zeropad_0){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), 
coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } else { + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } memoryBarrierShared(); barrier(); } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.spv index 24befadf4..3ec2c99d9 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_2x2.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.comp b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.comp index 6a21b4477..da2afb91b 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.comp @@ -5,7 +5,6 @@ const float M_SQRT1_2 = 0.70710678118654752440084436210485; layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) 
const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -200,24 +203,30 @@ void main() { vec2 temp2[8]; for(uint coordinate=0; coordinate<3; coordinate++){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+2*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+3*gl_WorkGroupSize.y, coordinate)]; if (zeropad_0){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=vec2(0,0); - 
sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=vec2(0,0); + for (uint i=0; i < 4; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; + for (uint i=4; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=vec2(0,0); } else { - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+4*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+5*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+6*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+7*gl_WorkGroupSize.y, coordinate)]; + for (uint i=0; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; } memoryBarrierShared(); barrier(); + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint 
index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + uint stageSize=1; float stageAngle=M_PI; @@ -464,7 +473,7 @@ void main() { //3x3 nonsymmetric convolution for a 3d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y+i*gl_WorkGroupSize.y, 0); + uint icellkernel= indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(fft_dim), 0); float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x + kernel[icellkernel+2*inputStride_3].x * temp2[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y - kernel[icellkernel+2*inputStride_3].y * temp2[i].y; float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].x * temp2[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x + kernel[icellkernel+2*inputStride_3].y * temp2[i].x; float temp_spin_real1 = kernel[icellkernel+3*inputStride_3].x * temp0[i].x + kernel[icellkernel+4*inputStride_3].x * temp1[i].x + kernel[icellkernel+5*inputStride_3].x * temp2[i].x - kernel[icellkernel+3*inputStride_3].y * temp0[i].y - kernel[icellkernel+4*inputStride_3].y * temp1[i].y - kernel[icellkernel+5*inputStride_3].y * temp2[i].y; @@ -723,16 +732,24 @@ void main() { barrier(); } - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - 
outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - if (!zeropad_0){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - } + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + if (zeropad_0){ + for (uint i=0; i < 4; i++) + 
outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } else { + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } memoryBarrierShared(); barrier(); } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.spv index f153de28e..65f028aa9 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_nonsymmetric_3x3.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.comp b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.comp index d948e1377..6ab3157e5 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.comp @@ -5,7 +5,6 @@ const float M_SQRT1_2 = 0.70710678118654752440084436210485; layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout 
(constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -199,24 +202,30 @@ void main() { vec2 temp1[8]; for(uint coordinate=0; coordinate<2; coordinate++){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+2*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+3*gl_WorkGroupSize.y, coordinate)]; if (zeropad_0){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=vec2(0,0); - 
sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=vec2(0,0); + for (uint i=0; i < 4; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; + for (uint i=4; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=vec2(0,0); } else { - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+4*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+5*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+6*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+7*gl_WorkGroupSize.y, coordinate)]; + for (uint i=0; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; } memoryBarrierShared(); barrier(); + if (passID>0){ + for (uint i=0; i < 8; i++){ + float 
angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + uint stageSize=1; float stageAngle=M_PI; @@ -451,7 +460,7 @@ void main() { //2x2 symmetric convolution for a 2d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y+i*gl_WorkGroupSize.y, 0); + uint icellkernel= indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(fft_dim), 0); float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+2*inputStride_3].x * temp1[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+2*inputStride_3].y * temp1[i].y; @@ -695,17 +704,25 @@ void main() { memoryBarrierShared(); barrier(); } - - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - 
outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - if (!zeropad_0){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - } + + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + if (zeropad_0){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), 
coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } else { + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } memoryBarrierShared(); barrier(); } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.spv index 47b63b881..b3acc6279 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_2x2.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.comp b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.comp index 46e352ef8..5a86c0526 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.comp @@ -5,7 +5,6 @@ const float M_SQRT1_2 = 0.70710678118654752440084436210485; layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 
1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -197,27 +200,33 @@ void main() { vec2 temp0[8]; vec2 temp1[8]; vec2 temp2[8]; - + for(uint coordinate=0; coordinate<3; coordinate++){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+2*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+3*gl_WorkGroupSize.y, coordinate)]; if (zeropad_0){ - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=vec2(0,0); - 
sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=vec2(0,0); + for (uint i=0; i < 4; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; + for (uint i=4; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=vec2(0,0); } else { - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+4*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+5*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+6*gl_WorkGroupSize.y, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x, gl_LocalInvocationID.y+7*gl_WorkGroupSize.y, coordinate)]; + for (uint i=0; i < 8; i++) + sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]=inputs[indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]; } memoryBarrierShared(); barrier(); + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint 
index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; + float stageAngle=M_PI; //0-numStages stage @@ -463,7 +472,7 @@ void main() { //3x3 symmetric convolution for a 3d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_GlobalInvocationID.x, gl_GlobalInvocationID.y+i*gl_WorkGroupSize.y, 0); + uint icellkernel= indexInput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)%(stageStartSize)+(gl_GlobalInvocationID.x/fft_dim_x/stageStartSize)*(fft_dim), 0); float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x + kernel[icellkernel+2*inputStride_3].x * temp2[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y - kernel[icellkernel+2*inputStride_3].y * temp2[i].y; float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].x * temp2[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x + kernel[icellkernel+2*inputStride_3].y * temp2[i].x; float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x + kernel[icellkernel+4*inputStride_3].x * temp2[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y - kernel[icellkernel+4*inputStride_3].y * temp2[i].y; @@ -636,6 +645,7 @@ void main() { } } + memoryBarrierShared(); barrier(); switch(current_radix){ @@ -721,17 +731,24 @@ void main() { memoryBarrierShared(); barrier(); } - - outputs[indexOutput(gl_GlobalInvocationID.x, 
(gl_LocalInvocationID.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+2*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+2*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+3*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+3*gl_WorkGroupSize.y))]; - if (!zeropad_0){ - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+4*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+4*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+5*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+5*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+6*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+6*gl_WorkGroupSize.y))]; - outputs[indexOutput(gl_GlobalInvocationID.x, (gl_LocalInvocationID.y+7*gl_WorkGroupSize.y), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+7*gl_WorkGroupSize.y))]; - } + if (passID>0){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x/fft_dim_x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); 
+ sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + if (zeropad_0){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } else { + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(fft_dim_x), (gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/fft_dim_x)*(fft_dim), coordinate)]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + } memoryBarrierShared(); barrier(); } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.spv b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.spv index 716a9c3bd..a7310a0fa 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_grouped_convolution_symmetric_3x3.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.comp index 535fced67..0ed57c26e 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.comp @@ -3,10 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -20,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const 
uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -32,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -48,10 +50,10 @@ layout(std430, binding = 1) buffer Data2 { vec2 outputs[]; }; uint indexInput(uint index) { - return inputOffset+index * inputStride_0 + gl_GlobalInvocationID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + consts.batchID * inputStride_4; + return inputOffset+index * inputStride_0 + gl_WorkGroupID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + consts.batchID * inputStride_4; } uint indexOutput(uint index) { - return outputOffset+index * outputStride_0 + gl_GlobalInvocationID.y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3 + consts.batchID * outputStride_4; + return outputOffset+index * outputStride_0 + gl_WorkGroupID.y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * 
outputStride_3 + consts.batchID * outputStride_4; } uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); @@ -191,46 +193,32 @@ void radix8(inout vec2 values[8], inout vec2 w) { shared vec2 sdata[gl_WorkGroupSize.y*fft_dim];// gl_WorkGroupSize.x - fft size, gl_WorkGroupSize.y - grouped consequential ffts void main() { - + if ((zeropad_0)&&(!inverse)){ - if (ratioDirection_0){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0)&&(!inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=(inverse) ? 
-M_PI : M_PI; //0-numStages stage @@ -435,38 +423,24 @@ void main() { memoryBarrierShared(); barrier(); } - - if ((zeropad_0)&&(inverse)){ - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)&&(inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)&&(inverse)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.spv index 7e9a45b5c..e7f1cccf4 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_afterR2C.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_afterR2C.comp deleted file mode 100644 index e0280b115..000000000 --- 
a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_afterR2C.comp +++ /dev/null @@ -1,485 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout (constant_id = 5) const bool inverse = false; -layout (constant_id = 6) const bool zeropad_0 = false; -layout (constant_id = 7) const bool zeropad_1 = false; -layout (constant_id = 8) const uint inputStride_0 = 1; -layout (constant_id = 9) const uint inputStride_1 = 1; -layout (constant_id = 10) const uint inputStride_2 = 1; -layout (constant_id = 11) const uint inputStride_3 = 1; -layout (constant_id = 12) const uint inputStride_4 = 1; -layout (constant_id = 13) const uint outputStride_0 = 1; -layout (constant_id = 14) const uint outputStride_1 = 1; -layout (constant_id = 15) const uint outputStride_2 = 1; -layout (constant_id = 16) const uint outputStride_3 = 1; -layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; -layout (constant_id = 21) const uint numStages = 1; -layout (constant_id = 22) const uint stageRadix_0 = 8; -layout (constant_id = 23) const uint stageRadix_1 = 8; -layout (constant_id = 24) const uint ratio_0 = 8; -layout (constant_id = 25) const uint ratio_1 = 8; -layout (constant_id = 26) const bool ratioDirection_0 = false; -layout (constant_id = 27) const bool ratioDirection_1 = true; -layout (constant_id = 28) const uint inputOffset = 0; -layout (constant_id = 29) const uint outputOffset = 0; - -layout(push_constant) uniform PushConsts -{ - uint coordinate; - uint batchID; -} consts; - - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; 
-}; -uint indexInput(uint index) { - return inputOffset+index * inputStride_0 + gl_GlobalInvocationID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3+ consts.batchID * inputStride_4; -} -uint indexOutput(uint index) { - return outputOffset+index * outputStride_0 + gl_GlobalInvocationID.y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3+ consts.batchID * outputStride_4; -} -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); -} -void radix2(inout vec2 values[2], vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (inverse) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (inverse) ? 
vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(inverse) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(inverse) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (inverse) ? vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -shared vec2 sdata[gl_WorkGroupSize.y*fft_dim]; - -void main() { - - if (zeropad_0){ - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - 
sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, 
ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - } - - } - memoryBarrierShared(); - barrier(); - - vec2 temp[8]; - uint stageSize=1; - float stageAngle=(inverse) ? -M_PI : M_PI; - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0)) - return; - - if (ratioDirection_0){ - for (uint i=0; i<8; i++) - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; - - }else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - } - - - memoryBarrierShared(); - barrier(); - - vec2 temp[8]; - uint stageSize=1; - float stageAngle=(inverse) ? -M_PI : M_PI; - //0-numStages stage - - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0)&&(!inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + + float stageAngle=(inverse) ? 
-M_PI : M_PI; + for (uint n=0; n < numStages; n++){ + uint current_radix = (n0)&&(inverse)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_GlobalInvocationID.x)*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))/float(fft_dim_full)); + uint index=(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)&&(inverse)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(stageStartSize) + stageStartSize*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/stageStartSize)*(stageStartSize*fft_dim))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + + } else { + + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_GlobalInvocationID.x%(stageStartSize) + stageStartSize*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y)+(gl_GlobalInvocationID.x/stageStartSize)*(stageStartSize*fft_dim))]=sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x*(gl_LocalInvocationID.y+i*gl_WorkGroupSize.y))]; + + } +} diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_strided.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_strided.spv new file mode 100644 index 000000000..496722edd Binary files /dev/null and b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2c_strided.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.comp index a80453279..855770f95 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.comp @@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id 
= 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.spv index 72446730f..401a81f8c 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_c2r.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.comp index 16e5f0410..4b238a25c 
100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.comp @@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -199,45 +202,29 @@ shared vec2 sdata[gl_WorkGroupSize.y*fft_dim];// gl_WorkGroupSize.x - fft size, void 
main() { vec2 temp0[8]; - if (zeropad_0){ - if (ratioDirection_0){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -457,7 +444,7 @@ void main() { vec2 temp0_out[8]; //1x1 convolution for a 1d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInputKernel(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, batchID); + uint icellkernel= indexInputKernel(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, batchID); temp0_out[i].x = kernel[icellkernel ].x * temp0[i].x - kernel[icellkernel ].y * temp0[i].y; temp0_out[i].y = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel ].y * temp0[i].x; @@ -681,54 +668,24 @@ void main() { memoryBarrierShared(); barrier(); } - if (zeropad_0){ - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, 
batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y 
+ (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, batchID)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } - } - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - 
outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, batchID)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, batchID)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } memoryBarrierShared(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.spv index c19f750f3..aace272d2 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_1x1.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_afterR2C_1x1.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_afterR2C_1x1.comp deleted file mode 100644 index 6e9db7d73..000000000 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_afterR2C_1x1.comp +++ /dev/null @@ -1,766 +0,0 @@ -#version 450 - -const float M_PI = 3.1415926535897932384626433832795; -const float M_SQRT1_2 = 0.70710678118654752440084436210485; - -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; -layout (constant_id = 4) const uint fft_dim = 2048; - -layout (constant_id = 5) const bool inverse = false; -layout (constant_id = 6) const bool zeropad_0 = false; -layout (constant_id = 7) const bool zeropad_1 = false; -layout (constant_id = 8) const uint inputStride_0 = 1; -layout (constant_id = 9) const uint inputStride_1 = 1; -layout (constant_id = 10) const uint inputStride_2 = 1; -layout (constant_id = 11) const uint inputStride_3 = 1; -layout (constant_id = 12) const uint inputStride_4 = 1; -layout (constant_id = 13) const uint outputStride_0 = 1; -layout (constant_id = 14) const uint outputStride_1 = 1; -layout 
(constant_id = 15) const uint outputStride_2 = 1; -layout (constant_id = 16) const uint outputStride_3 = 1; -layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; -layout (constant_id = 21) const uint numStages = 1; -layout (constant_id = 22) const uint stageRadix_0 = 8; -layout (constant_id = 23) const uint stageRadix_1 = 8; -layout (constant_id = 24) const uint ratio_0 = 8; -layout (constant_id = 25) const uint ratio_1 = 8; -layout (constant_id = 26) const bool ratioDirection_0 = false; -layout (constant_id = 27) const bool ratioDirection_1 = true; -layout (constant_id = 28) const uint inputOffset = 0; -layout (constant_id = 29) const uint outputOffset = 0; - -layout(push_constant) uniform PushConsts -{ - uint coordinate; - uint batch; -} consts; - -layout(std430, binding = 0) buffer Data { - vec2 inputs[]; -}; - -layout(std430, binding = 1) buffer Data2 { - vec2 outputs[]; -}; - -layout(std430, binding = 2) readonly buffer Kernel_FFT { - vec2 kernel []; -}; -uint indexInput(uint index) { - return inputOffset+index * inputStride_0 + gl_GlobalInvocationID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3; -} -uint indexOutput(uint index, uint batchID) { - return outputOffset+index * outputStride_0 + gl_GlobalInvocationID.y * outputStride_1 + gl_GlobalInvocationID.z * outputStride_2 + consts.coordinate * outputStride_3 + batchID * outputStride_4; -} - -uint indexInputKernel(uint index, uint batchID) { - return inputOffset+index * inputStride_0 + gl_GlobalInvocationID.y * inputStride_1 + gl_GlobalInvocationID.z * inputStride_2 + consts.coordinate * inputStride_3 + batchID * inputStride_4; -} - -uint positionShuffle(uint pos, uint ratio, bool ratioDirection ) { - return (((pos)/(ratio))+((pos)%(ratio))*(fft_dim)); -} -void radix2(inout vec2 values[2], 
vec2 w) { - vec2 temp; - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; -} - -void radix4(inout vec2 values[4],inout vec2 w, float inverse) { - - //DIF 1st stage with double angle - vec2 temp; - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - //DIF 2nd stage with half angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - w = (inverse < 0 ) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*w.x-values[3].y*w.y; - temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - temp = values[1]; - values[1]=values[2]; - values[2]=temp; -} - -void radix8(inout vec2 values[8], inout vec2 w, float inverse) { - //DIF 1st stage with quadruple angle - - vec2 temp; - temp.x=values[4].x*w.x-values[4].y*w.y; - temp.y=values[4].y*w.x+values[4].x*w.y; - values[4]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[5].x*w.x-values[5].y*w.y; - temp.y=values[5].y*w.x+values[5].x*w.y; - values[5]=values[1]-temp; - values[1]=values[1]+temp; - - temp.x=values[6].x*w.x-values[6].y*w.y; - temp.y=values[6].y*w.x+values[6].x*w.y; - values[6]=values[2]-temp; - values[2]=values[2]+temp; - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[3]-temp; - values[3]=values[3]+temp; - - //DIF 2nd stage with double angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[2].x*w.x-values[2].y*w.y; - temp.y=values[2].y*w.x+values[2].x*w.y; - values[2]=values[0]-temp; - values[0]=values[0]+temp; - - temp.x=values[3].x*w.x-values[3].y*w.y; - 
temp.y=values[3].y*w.x+values[3].x*w.y; - values[3]=values[1]-temp; - values[1]=values[1]+temp; - - vec2 iw = (inverse < 0) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[6].x*iw.x-values[6].y*iw.y; - temp.y=values[6].y*iw.x+values[6].x*iw.y; - values[6]=values[4]-temp; - values[4]=values[4]+temp; - - temp.x=values[7].x*iw.x-values[7].y*iw.y; - temp.y=values[7].y*iw.x+values[7].x*iw.y; - values[7]=values[5]-temp; - values[5]=values[5]+temp; - - //DIF 3rd stage with angle - w = normalize(w + vec2(1.0, 0.0)); - - temp.x=values[1].x*w.x-values[1].y*w.y; - temp.y=values[1].y*w.x+values[1].x*w.y; - values[1]=values[0]-temp; - values[0]=values[0]+temp; - - iw = (inverse < 0) ? vec2(w.y, -w.x) : vec2(-w.y, w.x); - - temp.x=values[3].x*iw.x-values[3].y*iw.y; - temp.y=values[3].y*iw.x+values[3].x*iw.y; - values[3]=values[2]-temp; - values[2]=values[2]+temp; - - iw.x=(inverse < 0) ? w.x*M_SQRT1_2+w.y*M_SQRT1_2 : w.x*M_SQRT1_2-w.y*M_SQRT1_2; - iw.y=(inverse < 0) ? w.y*M_SQRT1_2-w.x*M_SQRT1_2 : w.y*M_SQRT1_2+w.x*M_SQRT1_2; - - temp.x=values[5].x*iw.x-values[5].y*iw.y; - temp.y=values[5].y*iw.x+values[5].x*iw.y; - values[5]=values[4]-temp; - values[4]=values[4]+temp; - - w = (inverse < 0) ? 
vec2(iw.y, -iw.x) : vec2(-iw.y, iw.x); - - temp.x=values[7].x*w.x-values[7].y*w.y; - temp.y=values[7].y*w.x+values[7].x*w.y; - values[7]=values[6]-temp; - values[6]=values[6]+temp; - - temp = values[1]; - values[1]=values[4]; - values[4]=temp; - - temp = values[3]; - values[3]=values[6]; - values[6]=temp; - -} - -shared vec2 sdata[gl_WorkGroupSize.y*fft_dim];// gl_WorkGroupSize.x - fft size, gl_WorkGroupSize.y - grouped consequential ffts - -void main() { - - vec2 temp0[8]; - if (zeropad_0){ - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + 
gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - 
sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - } - - } - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=M_PI; - - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - 
sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - 
}else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, 
coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - - } - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=M_PI; - - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, 
ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - - } - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=M_PI; - - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, 
coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - - } - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=M_PI; - - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=vec2(0,0); - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=vec2(0,0); - } - } else { - if (ratioDirection_0){ - if (gl_WorkGroupID.y < gl_NumWorkGroups.y-1){ - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - 
sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, ratio_0, ratioDirection_0)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - }else{ - if (gl_LocalInvocationID.y>0) - return; - sdata[gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - } else{ - sdata[fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x]=inputs[indexInput(gl_LocalInvocationID.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + 
(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]; - sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]=inputs[indexInput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]; - } - - } - memoryBarrierShared(); - barrier(); - - uint stageSize=1; - float stageAngle=M_PI; - - //0-numStages stage - for (uint n=0; n < numStages; n++){ - uint current_radix = (n0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -470,15 +457,15 @@ void main() { //2x2 nonsymmetric convolution for a 2d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 0); - float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; - float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * 
temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; - float temp_spin_real1 = kernel[icellkernel+2*inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x - kernel[icellkernel+2*inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y; - float temp_spin_imag1 = kernel[icellkernel+2*inputStride_3].x * temp0[i].y + kernel[icellkernel+3*inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].y * temp0[i].x + kernel[icellkernel+3*inputStride_3].y * temp1[i].x; - temp0[i].x= temp_spin_real0; - temp0[i].y= temp_spin_imag0; - temp1[i].x= temp_spin_real1; - temp1[i].y= temp_spin_imag1; + uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, 0); + float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; + float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; + float temp_spin_real1 = kernel[icellkernel+2*inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x - kernel[icellkernel+2*inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y; + float temp_spin_imag1 = kernel[icellkernel+2*inputStride_3].x * temp0[i].y + kernel[icellkernel+3*inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].y * temp0[i].x + kernel[icellkernel+3*inputStride_3].y * temp1[i].x; + temp0[i].x= temp_spin_real0; + temp0[i].y= temp_spin_imag0; + temp1[i].x= temp_spin_real1; + temp1[i].y= temp_spin_imag1; } //ifft @@ -714,38 +701,24 @@ void main() { memoryBarrierShared(); barrier(); } - - if (zeropad_0){ - if (ratioDirection_1){ - 
outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, 
coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, coordinate)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } memoryBarrierShared(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_2x2.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_2x2.spv index 30d49d85b..58900aae4 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_2x2.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_2x2.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.comp 
b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.comp index 117dce41c..bf7d7c846 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.comp @@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; 
layout(push_constant) uniform PushConsts { @@ -200,43 +203,28 @@ void main() { vec2 temp2[8]; for(uint coordinate=0; coordinate<3; coordinate++){ - if (zeropad_0){ - if (ratioDirection_0){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -483,7 +471,7 @@ void main() { //3x3 nonsymmetric convolution for a 3d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 0); + uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, 0); float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x + kernel[icellkernel+2*inputStride_3].x * temp2[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y - kernel[icellkernel+2*inputStride_3].y * temp2[i].y; float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].x * temp2[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x + kernel[icellkernel+2*inputStride_3].y * temp2[i].x; float temp_spin_real1 = kernel[icellkernel+3*inputStride_3].x * temp0[i].x + kernel[icellkernel+4*inputStride_3].x * temp1[i].x + 
kernel[icellkernel+5*inputStride_3].x * temp2[i].x - kernel[icellkernel+3*inputStride_3].y * temp0[i].y - kernel[icellkernel+4*inputStride_3].y * temp1[i].y - kernel[icellkernel+5*inputStride_3].y * temp2[i].y; @@ -742,37 +730,24 @@ void main() { barrier(); } - if (zeropad_0){ - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, 
coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, coordinate)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } memoryBarrierShared(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.spv index e55223946..c1293f779 100644 
Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_nonsymmetric_3x3.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.comp index a1d7772db..be160a927 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.comp @@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint 
inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -199,43 +202,28 @@ void main() { vec2 temp1[8]; for(uint coordinate=0; coordinate<2; coordinate++){ - if (zeropad_0){ - if (ratioDirection_0){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -470,15 +458,15 @@ void main() { //2x2 symmetric convolution for a 2d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 0); - float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; - float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; - float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+2*inputStride_3].x * temp1[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+2*inputStride_3].y * temp1[i].y; - float temp_spin_imag1 = 
kernel[icellkernel+inputStride_3].x * temp0[i].y + kernel[icellkernel+2*inputStride_3].x * temp1[i].y + kernel[icellkernel+inputStride_3].y * temp0[i].x + kernel[icellkernel+2*inputStride_3].y * temp1[i].x; - temp0[i].x= temp_spin_real0; - temp0[i].y= temp_spin_imag0; - temp1[i].x= temp_spin_real1; - temp1[i].y= temp_spin_imag1; + uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, 0); + float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y; + float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x; + float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+2*inputStride_3].x * temp1[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+2*inputStride_3].y * temp1[i].y; + float temp_spin_imag1 = kernel[icellkernel+inputStride_3].x * temp0[i].y + kernel[icellkernel+2*inputStride_3].x * temp1[i].y + kernel[icellkernel+inputStride_3].y * temp0[i].x + kernel[icellkernel+2*inputStride_3].y * temp1[i].x; + temp0[i].x= temp_spin_real0; + temp0[i].y= temp_spin_imag0; + temp1[i].x= temp_spin_real1; + temp1[i].y= temp_spin_imag1; } //ifft @@ -714,37 +702,24 @@ void main() { memoryBarrierShared(); barrier(); } - if (zeropad_0){ - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, 
coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, 
coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, coordinate)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } memoryBarrierShared(); barrier(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.spv index 8c59e02fc..64aa5110a 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_2x2.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.comp index e587d02c2..09950888f 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.comp @@ -2,11 +2,9 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 
0.70710678118654752440084436210485; -const float mult = 1e-7; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; layout (constant_id = 7) const bool zeropad_1 = false; @@ -20,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -32,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { @@ -200,43 +202,28 @@ void main() { vec2 temp2[8]; for(uint coordinate=0; coordinate<3; coordinate++){ - if (zeropad_0){ - if (ratioDirection_0){ - for (uint i=0; i<8; i++){ - uint pos=positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_0, ratioDirection_0); - if 
(pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; + } + memoryBarrierShared(); + barrier(); + } uint stageSize=1; float stageAngle=M_PI; @@ -483,19 +470,19 @@ void main() { //3x3 symmetric convolution for a 3d vector for (uint i=0; i<8; i++){ - uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, 0); - float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x + kernel[icellkernel+2*inputStride_3].x * temp2[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y - kernel[icellkernel+2*inputStride_3].y * temp2[i].y; - float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].x * temp2[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x + kernel[icellkernel+2*inputStride_3].y * temp2[i].x; - float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x + kernel[icellkernel+4*inputStride_3].x * temp2[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y - kernel[icellkernel+4*inputStride_3].y * temp2[i].y; - float temp_spin_imag1 = kernel[icellkernel+inputStride_3].x * temp0[i].y + kernel[icellkernel+3*inputStride_3].x * temp1[i].y + kernel[icellkernel+4*inputStride_3].x * temp2[i].y + kernel[icellkernel+inputStride_3].y * temp0[i].x + kernel[icellkernel+3*inputStride_3].y * temp1[i].x + 
kernel[icellkernel+4*inputStride_3].y * temp2[i].x; - float temp_spin_real2 = kernel[icellkernel+2*inputStride_3].x * temp0[i].x + kernel[icellkernel+4*inputStride_3].x * temp1[i].x + kernel[icellkernel+5*inputStride_3].x * temp2[i].x - kernel[icellkernel+2*inputStride_3].y * temp0[i].y - kernel[icellkernel+4*inputStride_3].y * temp1[i].y - kernel[icellkernel+5*inputStride_3].y * temp2[i].y; - float temp_spin_imag2 = kernel[icellkernel+2*inputStride_3].x * temp0[i].y + kernel[icellkernel+4*inputStride_3].x * temp1[i].y + kernel[icellkernel+5*inputStride_3].x * temp2[i].y + kernel[icellkernel+2*inputStride_3].y * temp0[i].x + kernel[icellkernel+4*inputStride_3].y * temp1[i].x + kernel[icellkernel+5*inputStride_3].y * temp2[i].x; - temp0[i].x= temp_spin_real0; - temp0[i].y= temp_spin_imag0; - temp1[i].x= temp_spin_real1; - temp1[i].y= temp_spin_imag1; - temp2[i].x= temp_spin_real2; - temp2[i].y= temp_spin_imag2; + uint icellkernel= indexInput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, 0); + float temp_spin_real0 = kernel[icellkernel ].x * temp0[i].x + kernel[icellkernel+inputStride_3].x * temp1[i].x + kernel[icellkernel+2*inputStride_3].x * temp2[i].x - kernel[icellkernel ].y * temp0[i].y - kernel[icellkernel+inputStride_3].y * temp1[i].y - kernel[icellkernel+2*inputStride_3].y * temp2[i].y; + float temp_spin_imag0 = kernel[icellkernel ].x * temp0[i].y + kernel[icellkernel+inputStride_3].x * temp1[i].y + kernel[icellkernel+2*inputStride_3].x * temp2[i].y + kernel[icellkernel ].y * temp0[i].x + kernel[icellkernel+inputStride_3].y * temp1[i].x + kernel[icellkernel+2*inputStride_3].y * temp2[i].x; + float temp_spin_real1 = kernel[icellkernel+inputStride_3].x * temp0[i].x + kernel[icellkernel+3*inputStride_3].x * temp1[i].x + kernel[icellkernel+4*inputStride_3].x * temp2[i].x - kernel[icellkernel+inputStride_3].y * temp0[i].y - kernel[icellkernel+3*inputStride_3].y * temp1[i].y - 
kernel[icellkernel+4*inputStride_3].y * temp2[i].y; + float temp_spin_imag1 = kernel[icellkernel+inputStride_3].x * temp0[i].y + kernel[icellkernel+3*inputStride_3].x * temp1[i].y + kernel[icellkernel+4*inputStride_3].x * temp2[i].y + kernel[icellkernel+inputStride_3].y * temp0[i].x + kernel[icellkernel+3*inputStride_3].y * temp1[i].x + kernel[icellkernel+4*inputStride_3].y * temp2[i].x; + float temp_spin_real2 = kernel[icellkernel+2*inputStride_3].x * temp0[i].x + kernel[icellkernel+4*inputStride_3].x * temp1[i].x + kernel[icellkernel+5*inputStride_3].x * temp2[i].x - kernel[icellkernel+2*inputStride_3].y * temp0[i].y - kernel[icellkernel+4*inputStride_3].y * temp1[i].y - kernel[icellkernel+5*inputStride_3].y * temp2[i].y; + float temp_spin_imag2 = kernel[icellkernel+2*inputStride_3].x * temp0[i].y + kernel[icellkernel+4*inputStride_3].x * temp1[i].y + kernel[icellkernel+5*inputStride_3].x * temp2[i].y + kernel[icellkernel+2*inputStride_3].y * temp0[i].x + kernel[icellkernel+4*inputStride_3].y * temp1[i].x + kernel[icellkernel+5*inputStride_3].y * temp2[i].x; + temp0[i].x= temp_spin_real0; + temp0[i].y= temp_spin_imag0; + temp1[i].x= temp_spin_real1; + temp1[i].y= temp_spin_imag1; + temp2[i].x= temp_spin_real2; + temp2[i].y= temp_spin_imag2; } //ifft @@ -742,37 +729,24 @@ void main() { barrier(); } - if (zeropad_0){ - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - - }else{ - 
for (uint i=0; i<8; i++){ - uint pos = positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1); - if (pos%fft_dim0)){ + for (uint i=0; i < 8; i++){ + float angle=2*M_PI*(((gl_WorkGroupID.x*gl_WorkGroupSize.y+gl_LocalInvocationID.y)*(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x))/float(fft_dim_full)); + uint index=(fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)); + vec2 mult = vec2(cos(angle),-sin(angle)); + vec2 res=vec2(sdata[index].x*mult.x-sdata[index].y*mult.y,sdata[index].y*mult.x+sdata[index].x*mult.y); + sdata[index]=res; } + memoryBarrierShared(); + barrier(); + } + if ((zeropad_0)){ + for (uint i=0; i < 4; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; + } else { - if (ratioDirection_1){ - outputs[indexOutput(gl_LocalInvocationID.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+2*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+2*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+3*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+3*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+4*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+4*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+5*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+5*gl_WorkGroupSize.x)]; - 
outputs[indexOutput(gl_LocalInvocationID.x+6*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+6*gl_WorkGroupSize.x)]; - outputs[indexOutput(gl_LocalInvocationID.x+7*gl_WorkGroupSize.x, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+7*gl_WorkGroupSize.x)]; - - }else{ - - for (uint i=0; i<8; i++) - outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, coordinate)]=sdata[positionShuffle(fft_dim*gl_LocalInvocationID.y + gl_LocalInvocationID.x+i*gl_WorkGroupSize.x, ratio_1, ratioDirection_1)]; - - } + for (uint i=0; i < 8; i++) + outputs[indexOutput(gl_LocalInvocationID.x+i*gl_WorkGroupSize.x+gl_LocalInvocationID.y*fft_dim+(gl_WorkGroupID.x)*gl_WorkGroupSize.y*fft_dim, coordinate)]=sdata[fft_dim*gl_LocalInvocationID.y + (gl_LocalInvocationID.x+i*gl_WorkGroupSize.x)]; } memoryBarrierShared(); diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.spv index 07d3bed6b..df2f518f5 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_convolution_symmetric_3x3.spv differ diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.comp b/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.comp index 7a4957a27..37c2c7425 100644 --- a/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.comp +++ b/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.comp @@ -3,9 +3,8 @@ const float M_PI = 3.1415926535897932384626433832795; const float M_SQRT1_2 = 0.70710678118654752440084436210485; -layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in; +layout (local_size_x_id = 1, local_size_y_id = 2, local_size_z_id = 3) in;// 32, fft/8, 1: total <1024 layout (constant_id = 4) const uint fft_dim = 2048; - layout (constant_id = 5) const bool inverse = false; layout (constant_id = 6) const bool zeropad_0 = false; 
layout (constant_id = 7) const bool zeropad_1 = false; @@ -19,9 +18,9 @@ layout (constant_id = 14) const uint outputStride_1 = 1; layout (constant_id = 15) const uint outputStride_2 = 1; layout (constant_id = 16) const uint outputStride_3 = 1; layout (constant_id = 17) const uint outputStride_4 = 1; -layout (constant_id = 18) const uint radixStride_0 = 1; -layout (constant_id = 19) const uint radixStride_1 = 1; -layout (constant_id = 20) const uint radixStride_2 = 1; +layout (constant_id = 18) const uint fft_dim_full = 2048; +layout (constant_id = 19) const uint stageStartSize = 2048; +layout (constant_id = 20) const uint fft_dim_x = 2048; layout (constant_id = 21) const uint numStages = 1; layout (constant_id = 22) const uint stageRadix_0 = 8; layout (constant_id = 23) const uint stageRadix_1 = 8; @@ -31,6 +30,10 @@ layout (constant_id = 26) const bool ratioDirection_0 = false; layout (constant_id = 27) const bool ratioDirection_1 = true; layout (constant_id = 28) const uint inputOffset = 0; layout (constant_id = 29) const uint outputOffset = 0; +layout (constant_id = 30) const uint passID = 0; +const uint radixStride_0 = fft_dim/2; +const uint radixStride_1 = fft_dim/4; +const uint radixStride_2 = fft_dim/8; layout(push_constant) uniform PushConsts { diff --git a/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.spv b/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.spv index 319847a30..b30ffa4bc 100644 Binary files a/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.spv and b/core/thirdparty/VkFFT/shaders/vkFFT_single_r2c.spv differ diff --git a/core/thirdparty/VkFFT/vkFFT.h b/core/thirdparty/VkFFT/vkFFT.h index 800e85879..278e7e77d 100644 --- a/core/thirdparty/VkFFT/vkFFT.h +++ b/core/thirdparty/VkFFT/vkFFT.h @@ -12,13 +12,14 @@ typedef struct { uint32_t FFTdim = 1; //FFT dimensionality (1, 2 or 3) uint32_t radix = 8; //FFT radix (2, 4 or 8) bool performZeropadding[3] = { false, false, false }; // perform zeropadding (false - off, true - on) - bool performTranspose[2] 
= { true, true }; //will be selected automatically + bool performTranspose[2] = { false, false }; //will be selected automatically bool performConvolution = false; //perform convolution in this application (false - off, true - on) bool performR2C = false; //perform R2C/C2R decomposition (false - off, true - on) bool inverse = false; //perform inverse FFT (false - forward, true - inverse) bool symmetricKernel = false; //specify if kernel in 2x2 or 3x3 matrix convolution is symmetric bool isInputFormatted = false; //specify if input buffer is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) - false - padded, true - not padded bool isOutputFormatted = false; //specify if output buffer is not padded for R2C if out-of-place mode is selected (only if numberBatches==1 and numberKernels==1) - false - padded, true - not padded + uint32_t registerBoost = 1; //specify if register file size is bigger than shared memory (on Nvidia 256KB register file can be used instead of 32KB of shared memory, set this constant to 4) char shaderPath[256] = "shaders/"; //path to shaders, can be selected automatically in CMake uint32_t coalescedMemory = 32;//in bits, for Nvidia compute capability >=6.0 is equal to 32, <6.0 and Intel is equal 128. Gonna work regardles, but if specified by user correctly, the performance will be higher. 
VkDevice* device; @@ -42,13 +43,16 @@ typedef struct { VkBool32 zeropad[2]; uint32_t inputStride[5]; uint32_t outputStride[5]; - uint32_t radixStride[3]; + uint32_t fft_dim_full; + uint32_t stageStartSize; + uint32_t fft_dim_x; uint32_t numStages; uint32_t stageRadix[2] = { 0,0 }; uint32_t ratio[2]; VkBool32 ratioDirection[2]; uint32_t inputOffset; uint32_t outputOffset; + uint32_t passID; } VkFFTSpecializationConstantsLayout; typedef struct { @@ -84,9 +88,10 @@ typedef struct { VkPipeline pipeline; } VkFFTTranspose; typedef struct { - - VkFFTAxis axes[3]; - VkFFTAxis supportAxes[2];//Nx/2+1 for r2c/c2r + uint32_t numAxisUploads[3]; + uint32_t numSupportAxisUploads[2]; + VkFFTAxis axes[3][5]; + VkFFTAxis supportAxes[2][5];//Nx/2+1 for r2c/c2r VkFFTTranspose transpose[2]; } VkFFTPlan; @@ -132,8 +137,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_c2r.spv"); break; case 2: - //printf("vkFFT_single_c2r_zp\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_c2r_zp.spv"); + //printf("vkFFT_single_c2c_strided\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_c2c_strided.spv"); break; case 3: //printf("vkFFT_single_r2c\n"); @@ -164,8 +169,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_1x1.spv"); break; case 10: - //printf("vkFFT_single_convolution_afterR2C_1x1\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_afterR2C_1x1.spv"); + //printf("vkFFT_single_strided_convolution_1x1\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_strided_convolution_1x1.spv"); break; case 11: //printf("vkFFT_grouped_convolution_symmetric_2x2\n"); @@ -176,8 +181,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_symmetric_2x2.spv"); break; case 13: - 
//printf("vkFFT_single_convolution_afterR2C_symmetric_2x2\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_afterR2C_symmetric_2x2.spv"); + //printf("vkFFT_single_strided_convolution_symmetric_2x2\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_strided_convolution_symmetric_2x2.spv"); break; case 14: //printf("vkFFT_grouped_convolution_nonsymmetric_2x2\n"); @@ -188,8 +193,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_nonsymmetric_2x2.spv"); break; case 16: - //printf("vkFFT_single_convolution_afterR2C_nonsymmetric_2x2\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_afterR2C_nonsymmetric_2x2.spv"); + //printf("vkFFT_single_strided_convolution_nonsymmetric_2x2\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_strided_convolution_nonsymmetric_2x2.spv"); break; case 17: //printf("vkFFT_grouped_convolution_symmetric_3x3\n"); @@ -200,8 +205,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_symmetric_3x3.spv"); break; case 19: - //printf("vkFFT_single_convolution_afterR2C_symmetric_3x3\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_afterR2C_symmetric_3x3.spv"); + //printf("vkFFT_single_strided_convolution_symmetric_3x3\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_strided_convolution_symmetric_3x3.spv"); break; case 20: //printf("vkFFT_grouped_convolution_nonsymmetric_3x3\n"); @@ -212,8 +217,8 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_nonsymmetric_3x3.spv"); break; case 22: - //printf("vkFFT_single_convolution_afterR2C_nonsymmetric_3x3\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_convolution_afterR2C_nonsymmetric_3x3.spv"); + 
//printf("vkFFT_single_strided_convolution_nonsymmetric_3x3\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_single_strided_convolution_nonsymmetric_3x3.spv"); break; case 23: //printf("vkFFT_single_c2r_8192\n"); @@ -228,26 +233,57 @@ typedef struct VkFFTApplication { sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_c2c_8192.spv"); break; case 26: - //printf("vkFFT_single_c2r_for_transposition_8192\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_c2r_for_transposition_8192.spv"); + //printf("vkFFT_grouped_strided_convolution_1x1\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_grouped_strided_convolution_1x1.spv"); break; case 27: - //printf("vkFFT_single_r2c_for_transposition_8192\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_r2c_for_transposition_8192.spv"); + //printf("vkFFT_grouped_strided_convolution_symmetric_2x2\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_grouped_strided_convolution_symmetric_2x2.spv"); break; case 28: - //printf("vkFFT_single_c2c_for_transposition_8192\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_c2c_for_transposition_8192.spv"); + //printf("vkFFT_grouped_strided_convolution_nonsymmetric_2x2\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_grouped_strided_convolution_nonsymmetric_2x2.spv"); break; case 29: - //printf("vkFFT_single_c2c_afterR2C_for_transposition_8192\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_c2c_afterR2C_for_transposition_8192.spv"); + //printf("vkFFT_grouped_strided_convolution_symmetric_3x3\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_grouped_strided_convolution_symmetric_3x3.spv"); break; case 30: - //printf("vkFFT_single_c2c_beforeC2R_for_transposition_8192\n"); - sprintf(filename, "%s%s", configuration.shaderPath, "8192/vkFFT_single_c2c_beforeC2R_for_transposition_8192.spv"); + 
//printf("vkFFT_grouped_strided_convolution_nonsymmetric_3x3\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "vkFFT_grouped_strided_convolution_nonsymmetric_3x3.spv"); + break; + case 33: + //printf("vkFFT_single_c2r_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2r_16384.spv"); + break; + case 34: + //printf("vkFFT_single_r2c_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_r2c_16384.spv"); + break; + case 35: + //printf("vkFFT_single_c2c_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2c_16384.spv"); + break; + case 36: + //printf("vkFFT_single_c2r_for_transposition_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2r_for_transposition_16384.spv"); + break; + case 37: + //printf("vkFFT_single_r2c_for_transposition_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_r2c_for_transposition_16384.spv"); + break; + case 38: + //printf("vkFFT_single_c2c_for_transposition_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2c_for_transposition_16384.spv"); + break; + case 39: + //printf("vkFFT_single_c2c_afterR2C_for_transposition_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2c_afterR2C_for_transposition_16384.spv"); + break; + case 40: + //printf("vkFFT_single_c2c_beforeC2R_for_transposition_16384\n"); + sprintf(filename, "%s%s", configuration.shaderPath, "16384/vkFFT_single_c2c_beforeC2R_for_transposition_16384.spv"); break; - } @@ -260,61 +296,410 @@ typedef struct VkFFTApplication { free(code); } - void VkFFTPlanAxis(VkFFTPlan* FFTPlan, uint32_t axis_id, bool inverse) { + void VkFFTPlanAxis(VkFFTPlan* FFTPlan, uint32_t axis_id, uint32_t axis_upload_id, bool inverse) { //get radix stages - VkFFTAxis* axis = &FFTPlan->axes[axis_id]; - //for (uint32_t i; i<3; i++) - // axis->specializationConstants.size[i] 
= configuration.size[i]; - - //configure radix stages - uint32_t logSize = log2(configuration.size[axis_id]); - switch (configuration.radix) { - case 8: { - uint32_t stage8 = logSize / 3; - uint32_t stage4 = 0; - uint32_t stage2 = 0; - if (logSize % 3 == 2) - stage4 = 1; - if (logSize % 3 == 1) - stage2 = 1; - axis->specializationConstants.numStages = stage8 + stage4 + stage2; - - axis->specializationConstants.stageRadix[0] = 8; - axis->specializationConstants.stageRadix[1] = 8; - if (logSize % 3 == 2) - axis->specializationConstants.stageRadix[1] = 4; - if (logSize % 3 == 1) - axis->specializationConstants.stageRadix[1] = 2; - break; - } - case 4: { - uint32_t stage4 = logSize / 2; - uint32_t stage2 = 0; - if (logSize % 2 == 1) - stage2 = 1; - axis->specializationConstants.numStages = stage4 + stage2; + VkFFTAxis* axis = &FFTPlan->axes[axis_id][axis_upload_id]; + + if (axis_id == 0) { + //configure radix stages + uint32_t logSize = log2(configuration.size[axis_id]); + uint32_t numPasses[8][8];//4096-8k(256KB)-16k(256KB)-32k-64k - find correct strided FFT configuration - x axis | 256-512-1024-2048(256KB)-4096(256KB)-8k(future?)-16k(future?) - find correct strided FFT configuration + for (uint32_t i = 0; i < 8; i++) { + for (uint32_t j = 0; j < 8; j++) { + numPasses[i][j] = 0; + } + } + uint32_t temp = configuration.size[axis_id]; + uint32_t startStage = 4096; + uint32_t continueStage = 256; + uint32_t maxPassId[2] = { 0,0 }; + uint32_t minPassId[2] = { 0,0 }; + maxPassId[0] += log2(configuration.registerBoost); + uint32_t maxSingleSize = 8 * 4096 / configuration.coalescedMemory; + maxPassId[1] = log2(maxSingleSize / 256); + minPassId[1] = (maxSingleSize >= 512) ? 
1 : 0; + //maxPassId[1] += log2(configuration.registerBoost);//in development + for (uint32_t i = 0; i < 8; i++) { + for (uint32_t j = 0; j < 8; j++) { + temp /= startStage; + numPasses[i][j]++; + while (temp > 1) + { + temp /= continueStage; + numPasses[i][j]++; + } + continueStage *= 2; + temp = configuration.size[axis_id]; + } + continueStage = 256; + startStage *= 2; + } + uint32_t passId[2] = { minPassId[0], minPassId[1] }; + for (uint32_t i = minPassId[0]; i < maxPassId[0]+1; i++) { + for (uint32_t j = minPassId[1]; j < maxPassId[1]+1; j++) { + if (numPasses[i][j] < numPasses[passId[0]][passId[1]]) { + passId[0] = i; + passId[1] = j; + } + } + } + FFTPlan->numAxisUploads[axis_id] = numPasses[passId[0]][passId[1]]; + if (axis_upload_id >= numPasses[passId[0]][passId[1]]) + return; + if (axis_upload_id == 0) { + //first pass is non-strided, special case + switch (configuration.radix) { + case 8: { + uint32_t logSize0Pass = (12 + passId[0] < logSize) ? 12 + passId[0] : logSize; //4096 + shift + if ((axis_upload_id + 1 == numPasses[passId[0]][passId[1]] - 1) && (logSize - logSize0Pass < 3)) + logSize0Pass -= (3 - (logSize - logSize0Pass)); + uint32_t stage8 = logSize0Pass / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSize0Pass % 3 == 2) + stage4 = 1; + if (logSize0Pass % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + + axis->specializationConstants.numStages = stage8; + axis->specializationConstants.fftDim = pow(8, stage8); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + axis->specializationConstants.stageStartSize = 1; + if 
(configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; - axis->specializationConstants.stageRadix[0] = 4; - axis->specializationConstants.stageRadix[1] = 4; - if (logSize % 2 == 1) - axis->specializationConstants.stageRadix[1] = 2; - break; - } - case 2: { - uint32_t stage2 = logSize; - axis->specializationConstants.numStages = stage2; + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + axis->specializationConstants.numStages = stage2; - axis->specializationConstants.stageRadix[0] = 2; - axis->specializationConstants.stageRadix[1] = 2; - break; - } + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } + } + else { + //passes after first are done similar to strided passes in y and z + uint32_t logSizeLaterPass = (logSize - 12 - passId[0]<3) ? 
3 : logSize - 12 - passId[0]; //4096 + shift + switch (configuration.radix) { + case 8: { + uint32_t stage8 = logSizeLaterPass / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSizeLaterPass % 3 == 2) + stage4 = 1; + if (logSizeLaterPass % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + uint32_t locNumStages = 0; + if (passId[1] == minPassId[1]) { + locNumStages = stage8 / (numPasses[passId[0]][passId[1]] - 1); + if (axis_upload_id < stage8 % (numPasses[passId[0]][passId[1]] - 1)) + locNumStages++; + axis->specializationConstants.numStages = locNumStages; + axis->specializationConstants.fftDim = pow(8, locNumStages); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (axis_upload_id == (numPasses[passId[0]][passId[1]] - 1)) { + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + } + axis->specializationConstants.stageStartSize = FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + if (axis_upload_id < numPasses[passId[0]][passId[1]] - 1) { + uint32_t locLogSize = 8 + passId[1]; + if ((axis_upload_id + 1 == numPasses[passId[0]][passId[1]] - 1) && (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2) < 3)) + locLogSize -= (3 - (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2))); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t 
locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + uint32_t locLogSize = (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2) < 3) ? 
3 : logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + } + + + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; + + + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + + axis->specializationConstants.numStages = stage2; + + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } + } + }else{ + //configure radix stages + uint32_t logSize = log2(configuration.size[axis_id]); + uint32_t numPasses[8] = { 0,0,0,0,0,0,0,0 };//256-512-1024-2048(256KB)-4096(256KB)-8k(future?)-16k(future?) 
- find correct strided FFT configuration + uint32_t temp = configuration.size[axis_id]; + uint32_t startStage = 256; + uint32_t maxSingleSize = 8 * 4096 / configuration.coalescedMemory; + uint32_t maxPassId = log2(maxSingleSize / 256); + uint32_t minPassId = (maxSingleSize >= 512) ? 1 : 0; + //maxPassId += log2(configuration.registerBoost);//in development + for (uint32_t i = 0; i < 8; i++) { + while (temp > 1) + { + temp /= startStage; + numPasses[i]++; + } + temp = configuration.size[axis_id]; + startStage *= 2; + } + uint32_t passId = minPassId; + for (uint32_t i = minPassId; i < maxPassId+1; i++) { + if (numPasses[i] < numPasses[passId]) { + passId = i; + } + } + FFTPlan->numAxisUploads[axis_id] = numPasses[passId]; + if (axis_upload_id >= numPasses[passId]) + return; + switch (configuration.radix) { + case 8: { + uint32_t stage8 = logSize / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSize % 3 == 2) + stage4 = 1; + if (logSize % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + uint32_t locNumStages = 0; + if (passId == minPassId) { + locNumStages = stage8 / numPasses[passId]; + if (axis_upload_id < stage8 % numPasses[passId]) + locNumStages++; + axis->specializationConstants.numStages = locNumStages; + axis->specializationConstants.fftDim = pow(8, locNumStages); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (axis_upload_id == numPasses[passId] - 1) { + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 
1 : FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + if (axis_upload_id < numPasses[passId] - 1) { + + uint32_t locLogSize = 8 + passId; + if ((axis_upload_id + 1 == numPasses[passId] - 1) && (logSize - (8 + passId) * (numPasses[passId] - 1) < 3)) + locLogSize -= (3 - (logSize - (8 + passId) * (numPasses[passId] - 1))); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 1 : FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + uint32_t locLogSize = (logSize - (8 + passId)*(numPasses[passId] - 1)<3) ? 
3 : logSize - (8 + passId) * (numPasses[passId] - 1); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 1 : FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + } + + + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; + + + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + + axis->specializationConstants.numStages = stage2; + + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } } - if (4096 / configuration.size[1] > configuration.coalescedMemory / 16) { + + //axis->groupedBatch = (4096 / axis->specializationConstants.fftDim >= configuration.coalescedMemory / 8) ? 
4096 / axis->specializationConstants.fftDim : configuration.coalescedMemory / 8; + axis->specializationConstants.passID = FFTPlan->numAxisUploads[axis_id] - 1 - axis_upload_id; + axis->specializationConstants.fft_dim_full = configuration.size[axis_id]; + axis->groupedBatch = (4096 / axis->specializationConstants.fftDim >= configuration.coalescedMemory / 8) ? 4096 / axis->specializationConstants.fftDim : configuration.coalescedMemory / 8; + //axis->groupedBatch = ((axis_upload_id > 0) && (axis->groupedBatch > axis->specializationConstants.stageStartSize)) ? axis->specializationConstants.stageStartSize : axis->groupedBatch; + /*if (4096 / configuration.size[1] > configuration.coalescedMemory / 16) { configuration.performTranspose[0] = false; - FFTPlan->axes[1].groupedBatch = 4096 / configuration.size[1]; + FFTPlan->groupedBatch = 4096 / configuration.size[1]; } else { configuration.performTranspose[0] = true; @@ -326,7 +711,7 @@ typedef struct VkFFTApplication { } else { configuration.performTranspose[1] = true; - } + }*/ //configure strides if (configuration.performR2C) { @@ -378,7 +763,7 @@ typedef struct VkFFTApplication { axis->specializationConstants.outputStride[2] = axis->specializationConstants.inputStride[2]; axis->specializationConstants.outputStride[3] = axis->specializationConstants.inputStride[3]; if (axis_id == 0) { - if ((configuration.isInputFormatted) && (!inverse)) { + if ((axis_upload_id==0)&&(configuration.isInputFormatted) && (!inverse)) { if (configuration.performZeropadding[0]) axis->specializationConstants.inputStride[1] = configuration.size[0] / 2; @@ -392,7 +777,7 @@ typedef struct VkFFTApplication { else axis->specializationConstants.inputStride[3] = axis->specializationConstants.inputStride[2] * configuration.size[2]; } - if ((configuration.isOutputFormatted) && ((inverse) || ((configuration.performConvolution) && (configuration.FFTdim == 1)))) { + if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id]-1) && 
(configuration.isOutputFormatted) && ((inverse) || ((configuration.performConvolution) && (configuration.FFTdim == 1)))) { if (configuration.performZeropadding[0]) axis->specializationConstants.outputStride[1] = configuration.size[0] / 2; @@ -457,7 +842,7 @@ typedef struct VkFFTApplication { axis->specializationConstants.outputStride[2] = axis->specializationConstants.inputStride[2]; axis->specializationConstants.outputStride[3] = axis->specializationConstants.inputStride[3]; if (axis_id == 0) { - if ((configuration.isInputFormatted) && (!inverse)) { + if ((axis_upload_id == 0) && (configuration.isInputFormatted) && (!inverse)) { if (configuration.performZeropadding[0]) axis->specializationConstants.inputStride[1] = configuration.size[0] / 2; @@ -471,7 +856,7 @@ typedef struct VkFFTApplication { else axis->specializationConstants.inputStride[3] = axis->specializationConstants.inputStride[2] * configuration.size[2]; } - if ((configuration.isOutputFormatted) && ((inverse) || ((configuration.performConvolution) && (configuration.FFTdim == 1)))) { + if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id]-1) && (configuration.isOutputFormatted) && ((inverse) || ((configuration.performConvolution) && (configuration.FFTdim == 1)))) { if (configuration.performZeropadding[0]) axis->specializationConstants.outputStride[1] = configuration.size[0] / 2; @@ -489,10 +874,6 @@ typedef struct VkFFTApplication { } axis->specializationConstants.inputStride[4] = axis->specializationConstants.inputStride[3] * configuration.coordinateFeatures; axis->specializationConstants.outputStride[4] = axis->specializationConstants.outputStride[3] * configuration.coordinateFeatures; - for (uint32_t i = 0; i < 3; ++i) { - axis->specializationConstants.radixStride[i] = configuration.size[axis_id] / pow(2, i + 1); - - } axis->specializationConstants.inverse = inverse; axis->specializationConstants.zeropad[0] = configuration.performZeropadding[axis_id]; @@ -500,7 +881,7 @@ typedef struct 
VkFFTApplication { axis->specializationConstants.zeropad[1] = configuration.performZeropadding[axis_id + 1]; else axis->specializationConstants.zeropad[1] = false; - + //not needed anymore as we don't transpose if (!inverse) { switch (axis_id) { case 0: @@ -698,11 +1079,11 @@ typedef struct VkFFTApplication { VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; descriptorPoolSize.descriptorCount = 2; - if ((axis_id == 0) && (configuration.FFTdim == 1) && (configuration.performConvolution)) + if ((axis_id == 0) && (axis_upload_id == 0) && (configuration.FFTdim == 1) && (configuration.performConvolution)) descriptorPoolSize.descriptorCount = 3; - if ((axis_id == 1) && (configuration.FFTdim == 2) && (configuration.performConvolution)) + if ((axis_id == 1) && (axis_upload_id == 0) && (configuration.FFTdim == 2) && (configuration.performConvolution)) descriptorPoolSize.descriptorCount = 3; - if ((axis_id == 2) && (configuration.FFTdim == 3) && (configuration.performConvolution)) + if ((axis_id == 2) && (axis_upload_id == 0) && (configuration.FFTdim == 3) && (configuration.performConvolution)) descriptorPoolSize.descriptorCount = 3; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; @@ -736,15 +1117,15 @@ typedef struct VkFFTApplication { VkDescriptorBufferInfo descriptorBufferInfo = {}; if (i == 0) { - if (configuration.isInputFormatted && ( - ((axis_id == 0) && (!inverse)) - || ((axis_id == configuration.FFTdim-1) && (inverse))) + if ((axis_upload_id == FFTPlan->numAxisUploads[axis_id]-1) && (configuration.isInputFormatted) && ( + ((axis_id == 0) && (!inverse) ) + || ((axis_id == configuration.FFTdim-1) && (inverse) && (!configuration.performConvolution))) ) { descriptorBufferInfo.buffer = configuration.inputBuffer[0]; descriptorBufferInfo.range = configuration.inputBufferSize[0]; } else { - if ((configuration.numberKernels > 1) && (inverse)) { + if ((axis_upload_id == 0) && 
(configuration.numberKernels > 1) && (inverse) && (!configuration.performConvolution)) { descriptorBufferInfo.buffer = configuration.outputBuffer[0]; descriptorBufferInfo.range = configuration.outputBufferSize[0]; } @@ -756,7 +1137,7 @@ typedef struct VkFFTApplication { descriptorBufferInfo.offset = 0; } if (i == 1) { - if ((configuration.isOutputFormatted && ( + if ((axis_upload_id == 0) && (configuration.isOutputFormatted && ( ((axis_id == 0) && (inverse)) || ((axis_id == configuration.FFTdim-1) && (!inverse) && (!configuration.performConvolution)) || ((axis_id == 0) && (configuration.performConvolution) && (configuration.FFTdim == 1))) @@ -805,143 +1186,172 @@ typedef struct VkFFTApplication { vkCreatePipelineLayout(configuration.device[0], &pipelineLayoutCreateInfo, NULL, &axis->pipelineLayout); if (!inverse) { if (axis_id == 0) { - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - if (FFTPlan->axes[axis_id].axisBlock[0] > 512) FFTPlan->axes[axis_id].axisBlock[0] = 512; - if (configuration.performR2C) - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[1]) ? 1 : FFTPlan->axes[axis_id].specializationConstants.ratio[1] / 2; - else - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[1]) ? 1 : FFTPlan->axes[axis_id].specializationConstants.ratio[1]; + + if (axis_upload_id == 0) { + axis->axisBlock[0] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + if (axis->axisBlock[0] > 512) axis->axisBlock[0] = 512; + + axis->axisBlock[1] = 1; + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } + else { + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? 
axis->groupedBatch : axis->specializationConstants.stageStartSize; + + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; } if (axis_id == 1) { - if (configuration.performTranspose[0]) { - VkFFTPlanTranspose(FFTPlan, 0, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - if (FFTPlan->axes[axis_id].axisBlock[0] > 512) FFTPlan->axes[axis_id].axisBlock[0] = 512; - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[0]) ? FFTPlan->axes[axis_id].specializationConstants.ratio[0] : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + if (configuration.performR2C) { + if (axis_upload_id == 0) { + for (uint32_t i = 0; i < 8; i++) + VkFFTPlanSupportAxis(FFTPlan, 1, i, inverse); + } + axis->axisBlock[0] = (configuration.size[0] / 2 > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0] / 2; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0]/2 > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0]/2;*/ } else { - if (configuration.performR2C) { - VkFFTPlanSupportAxis(FFTPlan, 1, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] / 2 > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0] / 2; - } - else - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? 
configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + axis->axisBlock[0] = (configuration.size[0] > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0]; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0];*/ } + + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } if (axis_id == 2) { - if (configuration.performTranspose[1]) { - VkFFTPlanTranspose(FFTPlan, 1, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[0]) ? FFTPlan->axes[axis_id].specializationConstants.ratio[0] : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + if (configuration.performR2C) { + if (axis_upload_id == 0) { + for (uint32_t i = 0; i < 8; i++) + VkFFTPlanSupportAxis(FFTPlan, 2, i, inverse); + } + axis->axisBlock[0] = (configuration.size[0] / 2 > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0] / 2; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] / 2 > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0] / 2;*/ } else { - if (configuration.performTranspose[0]) { - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[1] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[1]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? 
configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; - } - else { - if (configuration.performR2C) { - VkFFTPlanSupportAxis(FFTPlan, 2, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] / 2 > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0] / 2; - } + axis->axisBlock[0] = (configuration.size[0] > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0]; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; else - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; - } + axis->axisBlock[0] = configuration.size[0];*/ } + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; } } else { if (axis_id == 0) { - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - if (FFTPlan->axes[axis_id].axisBlock[0] > 512) FFTPlan->axes[axis_id].axisBlock[0] = 512; - if (configuration.performR2C) - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[0]) ? FFTPlan->axes[axis_id].specializationConstants.ratio[0] / 2 : 1; - else - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[0]) ? 
FFTPlan->axes[axis_id].specializationConstants.ratio[0] : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + if (axis_upload_id == 0) { + axis->axisBlock[0] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + if (axis->axisBlock[0] > 512) axis->axisBlock[0] = 512; + + axis->axisBlock[1] = 1; + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } + else { + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize; + + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } } if (axis_id == 1) { - if (configuration.performTranspose[0]) { - VkFFTPlanTranspose(FFTPlan, 0, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - if (FFTPlan->axes[axis_id].axisBlock[0] > 512) FFTPlan->axes[axis_id].axisBlock[0] = 512; - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[1]) ? 1 : FFTPlan->axes[axis_id].specializationConstants.ratio[1]; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + if (configuration.performR2C) { + if (axis_upload_id == 0) { + for (uint32_t i = 0; i < 8; i++) + VkFFTPlanSupportAxis(FFTPlan, 1, i, inverse); + } + axis->axisBlock[0] = (configuration.size[0] / 2 > axis->groupedBatch) ? 
axis->groupedBatch : configuration.size[0] / 2; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] / 2 > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0] / 2;*/ } else { - if (configuration.performR2C) { - VkFFTPlanSupportAxis(FFTPlan, 1, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] / 2 > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0] / 2; - } - else - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + axis->axisBlock[0] = (configuration.size[0] > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0]; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0];*/ } + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } if (axis_id == 2) { - if (configuration.performTranspose[1]) { - VkFFTPlanTranspose(FFTPlan, 1, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[1] = (FFTPlan->axes[axis_id].specializationConstants.ratioDirection[1]) ? 1 : FFTPlan->axes[axis_id].specializationConstants.ratio[1]; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; + + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? 
axis->specializationConstants.fftDim / 8 : 1; + + if (configuration.performR2C) { + if (axis_upload_id == 0) { + for (uint32_t i = 0; i < 8; i++) + VkFFTPlanSupportAxis(FFTPlan, 2, i, inverse); + } + axis->axisBlock[0] = (configuration.size[0] / 2 > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0] / 2; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] / 2 > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0] / 2;*/ } else { - if (configuration.performTranspose[0]) { - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[1] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[1]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; - } - else { - if (configuration.performR2C) { - VkFFTPlanSupportAxis(FFTPlan, 2, inverse); - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] / 2 > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0] / 2; - } + axis->axisBlock[0] = (configuration.size[0] > axis->groupedBatch) ? axis->groupedBatch : configuration.size[0]; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[0] > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; else - FFTPlan->axes[axis_id].axisBlock[0] = (configuration.size[0] > FFTPlan->axes[axis_id].groupedBatch) ? FFTPlan->axes[axis_id].groupedBatch : configuration.size[0]; - FFTPlan->axes[axis_id].axisBlock[1] = (configuration.size[axis_id] / 8 > 1) ? 
configuration.size[axis_id] / 8 : 1; - FFTPlan->axes[axis_id].axisBlock[2] = 1; - FFTPlan->axes[axis_id].axisBlock[3] = configuration.size[axis_id]; - } + axis->axisBlock[0] = configuration.size[0];*/ } + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } } - VkSpecializationMapEntry specializationMapEntries[29] = { {} }; - for (uint32_t i = 0; i < 29; i++) { + VkSpecializationMapEntry specializationMapEntries[30] = { {} }; + for (uint32_t i = 0; i < 30; i++) { specializationMapEntries[i].constantID = i + 1; specializationMapEntries[i].size = sizeof(uint32_t); specializationMapEntries[i].offset = i * sizeof(uint32_t); } VkSpecializationInfo specializationInfo = {}; - specializationInfo.dataSize = 29 * sizeof(uint32_t); - specializationInfo.mapEntryCount = 29; + specializationInfo.dataSize = 30 * sizeof(uint32_t); + specializationInfo.mapEntryCount = 30; specializationInfo.pMapEntries = specializationMapEntries; - FFTPlan->axes[axis_id].specializationConstants.localSize[0] = FFTPlan->axes[axis_id].axisBlock[0]; - FFTPlan->axes[axis_id].specializationConstants.localSize[1] = FFTPlan->axes[axis_id].axisBlock[1]; - FFTPlan->axes[axis_id].specializationConstants.localSize[2] = FFTPlan->axes[axis_id].axisBlock[2]; - FFTPlan->axes[axis_id].specializationConstants.fftDim = FFTPlan->axes[axis_id].axisBlock[3]; - specializationInfo.pData = &FFTPlan->axes[axis_id].specializationConstants; + axis->specializationConstants.localSize[0] = axis->axisBlock[0]; + axis->specializationConstants.localSize[1] = axis->axisBlock[1]; + axis->specializationConstants.localSize[2] = axis->axisBlock[2]; + specializationInfo.pData = &axis->specializationConstants; VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; @@ -951,14 +1361,32 @@ typedef struct VkFFTApplication { if 
(configuration.performR2C) { if (axis_id == 0) { if (inverse) { - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(23, &pipelineShaderStageCreateInfo.module); + switch (configuration.registerBoost) { + case 1: + { + VkFFTInitShader(1, &pipelineShaderStageCreateInfo.module); break; - default: - switch (configuration.size[axis_id+1]) { + } + case 2: + { + switch (axis->specializationConstants.fftDim) { + case 8192: + VkFFTInitShader(23, &pipelineShaderStageCreateInfo.module); + break; + default: + VkFFTInitShader(1, &pipelineShaderStageCreateInfo.module); + break; + } + break; + } + case 4: + { + switch (axis->specializationConstants.fftDim) { case 8192: - VkFFTInitShader(26, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(23, &pipelineShaderStageCreateInfo.module); + break; + case 16384: + VkFFTInitShader(33, &pipelineShaderStageCreateInfo.module); break; default: VkFFTInitShader(1, &pipelineShaderStageCreateInfo.module); @@ -966,16 +1394,35 @@ typedef struct VkFFTApplication { } break; } + } } else { - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(24, &pipelineShaderStageCreateInfo.module); + switch (configuration.registerBoost) { + case 1: + { + VkFFTInitShader(3, &pipelineShaderStageCreateInfo.module); break; - default: - switch (configuration.size[axis_id + 1]) { + } + case 2: + { + switch (axis->specializationConstants.fftDim) { + case 8192: + VkFFTInitShader(24, &pipelineShaderStageCreateInfo.module); + break; + default: + VkFFTInitShader(3, &pipelineShaderStageCreateInfo.module); + break; + } + break; + } + case 4: + { + switch (axis->specializationConstants.fftDim) { case 8192: - VkFFTInitShader(27, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(24, &pipelineShaderStageCreateInfo.module); + break; + case 16384: + VkFFTInitShader(34, &pipelineShaderStageCreateInfo.module); break; default: VkFFTInitShader(3, &pipelineShaderStageCreateInfo.module); @@ -983,31 +1430,43 @@ typedef struct 
VkFFTApplication { } break; } + } } } if (axis_id == 1) { - if ((configuration.FFTdim == 2) && (configuration.performConvolution)) { - if (configuration.performTranspose[0]) + if ((configuration.FFTdim == 2) && (configuration.performConvolution)&&(axis_upload_id == 0)) { + + switch (configuration.matrixConvolution) { case 1: - VkFFTInitShader(10, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); break; case 2: if (configuration.symmetricKernel) - VkFFTInitShader(13, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(11, &pipelineShaderStageCreateInfo.module); else - VkFFTInitShader(16, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(14, &pipelineShaderStageCreateInfo.module); break; case 3: if (configuration.symmetricKernel) - VkFFTInitShader(19, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(17, &pipelineShaderStageCreateInfo.module); else - VkFFTInitShader(22, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); break; } - else + + } + else { + VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); + } + + } + + if (axis_id == 2) { + if ((configuration.FFTdim == 3) && (configuration.performConvolution)&&(axis_upload_id == 0)) { + switch (configuration.matrixConvolution) { case 1: VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); @@ -1025,48 +1484,18 @@ typedef struct VkFFTApplication { VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); break; } - + } else { - if (configuration.performTranspose[0]) { - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); - break; - default: - if (inverse) - VkFFTInitShader(6, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(5, &pipelineShaderStageCreateInfo.module); - break; - /*switch (configuration.size[axis_id - 1]) { - case 8192: - if (inverse) - VkFFTInitShader(30, 
&pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(29, &pipelineShaderStageCreateInfo.module); - break; - default: - if (inverse) - VkFFTInitShader(6, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(5, &pipelineShaderStageCreateInfo.module); - break; - } - break;*/ - } - - } - else { + VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); - } } - } - - if (axis_id == 2) { - if ((configuration.FFTdim == 3) && (configuration.performConvolution)) { - if (configuration.performTranspose[1]) + } + else { + if (axis_id == 0) { + if ((configuration.FFTdim == 1) && (configuration.performConvolution) && (axis_upload_id == 0)) { + if (axis_upload_id == 0) { switch (configuration.matrixConvolution) { case 1: VkFFTInitShader(9, &pipelineShaderStageCreateInfo.module); @@ -1084,114 +1513,77 @@ typedef struct VkFFTApplication { VkFFTInitShader(21, &pipelineShaderStageCreateInfo.module); break; } - else + } + else { switch (configuration.matrixConvolution) { case 1: - VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(10, &pipelineShaderStageCreateInfo.module); break; case 2: if (configuration.symmetricKernel) - VkFFTInitShader(11, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(13, &pipelineShaderStageCreateInfo.module); else - VkFFTInitShader(14, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(16, &pipelineShaderStageCreateInfo.module); break; case 3: if (configuration.symmetricKernel) - VkFFTInitShader(17, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(19, &pipelineShaderStageCreateInfo.module); else - VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(22, &pipelineShaderStageCreateInfo.module); break; } - + } } else { - if (configuration.performTranspose[1]) { - - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); - break; + switch (configuration.registerBoost) { + case 1: + { + if (axis_upload_id 
== 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module); + break; + } + case 2: + { + switch (axis->specializationConstants.fftDim) { + case 8192: + VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); + break; default: - switch (configuration.size[axis_id - 1]) { - case 8192: - VkFFTInitShader(28, &pipelineShaderStageCreateInfo.module); - break; - default: + if (axis_upload_id == 0) VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); - break; - } + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module); break; } - - } - else - VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); - } - } - } - else { - if (axis_id == 0) { - if ((configuration.FFTdim == 1) && (configuration.performConvolution)) { - - switch (configuration.matrixConvolution) { - case 1: - VkFFTInitShader(9, &pipelineShaderStageCreateInfo.module); - break; - case 2: - if (configuration.symmetricKernel) - VkFFTInitShader(12, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(15, &pipelineShaderStageCreateInfo.module); - break; - case 3: - if (configuration.symmetricKernel) - VkFFTInitShader(18, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(21, &pipelineShaderStageCreateInfo.module); break; } - } - else { - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); - break; - default: - switch (configuration.size[axis_id + 1]) { + case 4: + { + switch (axis->specializationConstants.fftDim) { case 8192: - VkFFTInitShader(28, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); + break; + case 16384: + VkFFTInitShader(35, &pipelineShaderStageCreateInfo.module); break; default: - VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + if (axis_upload_id == 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, 
&pipelineShaderStageCreateInfo.module); break; } break; } - + } } } if (axis_id == 1) { - if ((configuration.FFTdim == 2) && (configuration.performConvolution)) { - if (configuration.performTranspose[0]) - switch (configuration.matrixConvolution) { - case 1: - VkFFTInitShader(10, &pipelineShaderStageCreateInfo.module); - break; - case 2: - if (configuration.symmetricKernel) - VkFFTInitShader(13, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(16, &pipelineShaderStageCreateInfo.module); - break; - case 3: - if (configuration.symmetricKernel) - VkFFTInitShader(19, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(22, &pipelineShaderStageCreateInfo.module); - break; - } - else + if ((configuration.FFTdim == 2) && (configuration.performConvolution) && (axis_upload_id == 0)) { + switch (configuration.matrixConvolution) { case 1: VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); @@ -1209,53 +1601,18 @@ typedef struct VkFFTApplication { VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); break; } + } else { - if (configuration.performTranspose[0]) { - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); - break; - default: - switch (configuration.size[axis_id - 1]) { - case 8192: - VkFFTInitShader(28, &pipelineShaderStageCreateInfo.module); - break; - default: - VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); - break; - } - break; - } - } - else { VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); - } + } } if (axis_id == 2) { - if ((configuration.FFTdim == 3) && (configuration.performConvolution)) { - if (configuration.performTranspose[1]) - switch (configuration.matrixConvolution) { - case 1: - VkFFTInitShader(9, &pipelineShaderStageCreateInfo.module); - break; - case 2: - if (configuration.symmetricKernel) - VkFFTInitShader(12, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(15, &pipelineShaderStageCreateInfo.module); - 
break; - case 3: - if (configuration.symmetricKernel) - VkFFTInitShader(18, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(21, &pipelineShaderStageCreateInfo.module); - break; - } - else + if ((configuration.FFTdim == 3) && (configuration.performConvolution) && (axis_upload_id == 0)) { + switch (configuration.matrixConvolution) { case 1: VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); @@ -1273,26 +1630,10 @@ typedef struct VkFFTApplication { VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); break; } + } else { - if (configuration.performTranspose[1]) - switch (configuration.size[axis_id]) { - case 8192: - VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); - break; - default: - switch (configuration.size[axis_id - 1]) { - case 8192: - VkFFTInitShader(28, &pipelineShaderStageCreateInfo.module); - break; - default: - VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); - break; - } - break; - } - else - VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); + VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); } } } @@ -1310,60 +1651,399 @@ typedef struct VkFFTApplication { } - void VkFFTPlanSupportAxis(VkFFTPlan* FFTPlan, uint32_t axis_id, bool inverse) { + void VkFFTPlanSupportAxis(VkFFTPlan* FFTPlan, uint32_t axis_id, uint32_t axis_upload_id, bool inverse) { //get radix stages - VkFFTAxis* axis = &FFTPlan->supportAxes[axis_id - 1]; - //for (uint32_t i; i<3; i++) - // axis->specializationConstants.size[i] = configuration.size[i]; - - //configure radix stages - uint32_t logSize = log2(configuration.size[axis_id]); - - switch (configuration.radix) { - case 8: { - uint32_t stage8 = logSize / 3; - uint32_t stage4 = 0; - uint32_t stage2 = 0; - if (logSize % 3 == 2) - stage4 = 1; - if (logSize % 3 == 1) - stage2 = 1; - axis->specializationConstants.numStages = stage8 + stage4 + stage2; - - axis->specializationConstants.stageRadix[0] = 8; - axis->specializationConstants.stageRadix[1] = 8; - if (logSize % 3 == 
2) - axis->specializationConstants.stageRadix[1] = 4; - if (logSize % 3 == 1) - axis->specializationConstants.stageRadix[1] = 2; - break; - } - case 4: { - uint32_t stage4 = logSize / 2; - uint32_t stage2 = 0; - if (logSize % 2 == 1) - stage2 = 1; - axis->specializationConstants.numStages = stage4 + stage2; + VkFFTAxis* axis = &FFTPlan->supportAxes[axis_id - 1][axis_upload_id]; + if (axis_id == 1) { + //configure radix stages + uint32_t logSize = log2(configuration.size[axis_id]); + uint32_t numPasses[8][8];//4096-8k(256KB)-16k(256KB)-32k-64k - find correct strided FFT configuration - x axis | 256-512-1024-2048(256KB)-4096(256KB)-8k(future?)-16k(future?) - find correct strided FFT configuration + for (uint32_t i = 0; i < 8; i++) { + for (uint32_t j = 0; j < 8; j++) { + numPasses[i][j] = 0; + } + } + uint32_t temp = configuration.size[axis_id]; + uint32_t startStage = 4096; + uint32_t continueStage = 256; + uint32_t maxPassId[2] = { 0,0 }; + uint32_t minPassId[2] = { 0,0 }; + maxPassId[0] += log2(configuration.registerBoost); + uint32_t maxSingleSize = 8 * 4096 / configuration.coalescedMemory; + maxPassId[1] = log2(maxSingleSize / 256); + minPassId[1] = (maxSingleSize >= 512) ? 
1 : 0; + //maxPassId[1] += log2(configuration.registerBoost);//in development + for (uint32_t i = 0; i < 8; i++) { + for (uint32_t j = 0; j < 8; j++) { + temp /= startStage; + numPasses[i][j]++; + while (temp > 1) + { + temp /= continueStage; + numPasses[i][j]++; + } + continueStage *= 2; + temp = configuration.size[axis_id]; + } + continueStage = 256; + startStage *= 2; + } + uint32_t passId[2] = { minPassId[0], minPassId[1] }; + for (uint32_t i = minPassId[0]; i < maxPassId[0]+1; i++) { + for (uint32_t j = minPassId[1]; j < maxPassId[1]+1; j++) { + if (numPasses[i][j] < numPasses[passId[0]][passId[1]]) { + passId[0] = i; + passId[1] = j; + } + } + } + FFTPlan->numSupportAxisUploads[axis_id - 1] = numPasses[passId[0]][passId[1]]; + if (axis_upload_id >= numPasses[passId[0]][passId[1]]) + return; + if (axis_upload_id == 0) { + //first pass is non-strided, special case + switch (configuration.radix) { + case 8: { + uint32_t logSize0Pass = (12 + passId[0] < logSize) ? 12 + passId[0] : logSize; //4096 + shift + if ((axis_upload_id + 1 == numPasses[passId[0]][passId[1]] - 1) && (logSize - logSize0Pass < 3)) + logSize0Pass -= (3 - (logSize - logSize0Pass)); + uint32_t stage8 = logSize0Pass / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSize0Pass % 3 == 2) + stage4 = 1; + if (logSize0Pass % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + + axis->specializationConstants.numStages = stage8; + axis->specializationConstants.fftDim = pow(8, stage8); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + axis->specializationConstants.stageStartSize = 1; + if 
(configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; - axis->specializationConstants.stageRadix[0] = 4; - axis->specializationConstants.stageRadix[1] = 4; - if (logSize % 2 == 1) - axis->specializationConstants.stageRadix[1] = 2; - break; - } - case 2: { - uint32_t stage2 = logSize; - axis->specializationConstants.numStages = stage2; + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + axis->specializationConstants.numStages = stage2; - axis->specializationConstants.stageRadix[0] = 2; - axis->specializationConstants.stageRadix[1] = 2; - break; - } + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } + } + else { + //passes after first are done similar to strided passes in y and z + uint32_t logSizeLaterPass = (logSize - 12 - passId[0] < 3) ? 
3 : logSize - 12 - passId[0]; //4096 + shift + switch (configuration.radix) { + case 8: { + uint32_t stage8 = logSizeLaterPass / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSizeLaterPass % 3 == 2) + stage4 = 1; + if (logSizeLaterPass % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + uint32_t locNumStages = 0; + if (passId[1] == minPassId[1]) { + locNumStages = stage8 / (numPasses[passId[0]][passId[1]] - 1); + if (axis_upload_id < stage8 % (numPasses[passId[0]][passId[1]] - 1)) + locNumStages++; + axis->specializationConstants.numStages = locNumStages; + axis->specializationConstants.fftDim = pow(8, locNumStages); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (axis_upload_id == (numPasses[passId[0]][passId[1]] - 1)) { + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + } + axis->specializationConstants.stageStartSize = FFTPlan->supportAxes[axis_id-1][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.fftDim; + axis->specializationConstants.fft_dim_x = configuration.size[1]; + } + else { + if (axis_upload_id < numPasses[passId[0]][passId[1]] - 1) { + uint32_t locLogSize = 8 + passId[1]; + if ((axis_upload_id + 1 == numPasses[passId[0]][passId[1]] - 1) && (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2) < 3)) + locLogSize -= (3 - (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2))); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + 
locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + uint32_t locLogSize = (logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2) < 3) ? 3 : logSizeLaterPass - (8 + passId[1]) * (numPasses[passId[0]][passId[1]] - 2); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = 
configuration.size[0]; + } + } + + + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; + + + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + + axis->specializationConstants.numStages = stage2; + + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } + } } + else { + //configure radix stages + uint32_t logSize = log2(configuration.size[axis_id]); + uint32_t numPasses[8] = { 0,0,0,0,0,0,0,0 };//256-512-1024-2048(256KB)-4096(256KB)-8k(future?)-16k(future?) - find correct strided FFT configuration + uint32_t temp = configuration.size[axis_id]; + uint32_t startStage = 256; + uint32_t maxSingleSize = 8 * 4096 / configuration.coalescedMemory; + uint32_t maxPassId = log2(maxSingleSize / 256); + uint32_t minPassId = (maxSingleSize >= 512) ? 
1 : 0; + //maxPassId += log2(configuration.registerBoost); //in development + for (uint32_t i = 0; i < 8; i++) { + while (temp > 1) + { + temp /= startStage; + numPasses[i]++; + } + temp = configuration.size[axis_id]; + startStage *= 2; + } + uint32_t passId = minPassId; + for (uint32_t i = minPassId; i < maxPassId+1; i++) { + if (numPasses[i] < numPasses[passId]) { + passId = i; + } + } + FFTPlan->numSupportAxisUploads[axis_id-1] = numPasses[passId]; + if (axis_upload_id >= numPasses[passId]) + return; + switch (configuration.radix) { + case 8: { + uint32_t stage8 = logSize / 3; + uint32_t stage4 = 0; + uint32_t stage2 = 0; + if (logSize % 3 == 2) + stage4 = 1; + if (logSize % 3 == 1) + stage2 = 1; + uint32_t totNumStages = stage8 + stage4 + stage2; + uint32_t locNumStages = 0; + if (passId == minPassId) { + locNumStages = stage8 / numPasses[passId]; + if (axis_upload_id < stage8 % numPasses[passId]) + locNumStages++; + axis->specializationConstants.numStages = locNumStages; + axis->specializationConstants.fftDim = pow(8, locNumStages); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (axis_upload_id == numPasses[passId] - 1) { + if (stage4 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 4; + axis->specializationConstants.fftDim *= 4; + } + if (stage2 == 1) { + axis->specializationConstants.numStages++; + axis->specializationConstants.stageRadix[1] = 2; + axis->specializationConstants.fftDim *= 2; + } + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 
1 : FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->supportAxes[axis_id - 1][axis_upload_id - 1].specializationConstants.fftDim; + axis->specializationConstants.fft_dim_x = configuration.size[1]; + } + else { + if (axis_upload_id < numPasses[passId] - 1) { + + uint32_t locLogSize = 8 + passId; + if ((axis_upload_id + 1 == numPasses[passId] - 1) && (logSize - (8 + passId) * (numPasses[passId] - 1) < 3)) + locLogSize -= (3 - (logSize - (8 + passId) * (numPasses[passId] - 1))); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 1 : FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + else { + uint32_t locLogSize = (logSize - (8 + passId) * (numPasses[passId] - 1) < 3) ? 
3 : logSize - (8 + passId) * (numPasses[passId] - 1); + uint32_t locStage8 = locLogSize / 3; + uint32_t locStage4 = 0; + uint32_t locStage2 = 0; + if (locLogSize % 3 == 2) + locStage4 = 1; + if (locLogSize % 3 == 1) + locStage2 = 1; + axis->specializationConstants.numStages = locStage8 + locStage4 + locStage2; + axis->specializationConstants.fftDim = pow(2, locLogSize); + axis->specializationConstants.stageRadix[0] = 8; + axis->specializationConstants.stageRadix[1] = 8; + + if (locStage4 == 1) { + axis->specializationConstants.stageRadix[1] = 4; + } + if (locStage2 == 1) { + axis->specializationConstants.stageRadix[1] = 2; + } + axis->specializationConstants.stageStartSize = (axis_upload_id == 0) ? 1 : FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.stageStartSize * FFTPlan->axes[axis_id][axis_upload_id - 1].specializationConstants.fftDim; + if (configuration.performR2C) + axis->specializationConstants.fft_dim_x = configuration.size[0] / 2; + else + axis->specializationConstants.fft_dim_x = configuration.size[0]; + } + } + + + break; + } + case 4: { + uint32_t stage4 = logSize / 2; + uint32_t stage2 = 0; + if (logSize % 2 == 1) + stage2 = 1; + axis->specializationConstants.numStages = stage4 + stage2; + + axis->specializationConstants.stageRadix[0] = 4; + axis->specializationConstants.stageRadix[1] = 4; + if (logSize % 2 == 1) + axis->specializationConstants.stageRadix[1] = 2; + break; + } + case 2: { + uint32_t stage2 = logSize; + + axis->specializationConstants.numStages = stage2; + + + axis->specializationConstants.stageRadix[0] = 2; + axis->specializationConstants.stageRadix[1] = 2; + break; + } + } + } + axis->specializationConstants.passID = FFTPlan->numSupportAxisUploads[axis_id - 1] - 1 - axis_upload_id; + axis->specializationConstants.fft_dim_full = configuration.size[axis_id]; + axis->groupedBatch = (4096 / axis->specializationConstants.fftDim >= configuration.coalescedMemory / 8) ? 
4096 / axis->specializationConstants.fftDim : configuration.coalescedMemory / 8; + //axis->groupedBatch = ((axis_upload_id>0)&&(axis->groupedBatch > axis->specializationConstants.stageStartSize)) ? axis->specializationConstants.stageStartSize : axis->groupedBatch; //configure strides //perform r2c axis->specializationConstants.inputStride[0] = 1; @@ -1394,11 +2074,6 @@ typedef struct VkFFTApplication { axis->specializationConstants.inputStride[4] = axis->specializationConstants.inputStride[3] * configuration.coordinateFeatures; axis->specializationConstants.outputStride[4] = axis->specializationConstants.outputStride[3] * configuration.coordinateFeatures; - for (uint32_t i = 0; i < 3; ++i) { - axis->specializationConstants.radixStride[i] = configuration.size[axis_id] / pow(2, i + 1); - - } - axis->specializationConstants.inverse = inverse; axis->specializationConstants.zeropad[0] = configuration.performZeropadding[axis_id]; axis->specializationConstants.zeropad[1] = false; @@ -1411,9 +2086,9 @@ typedef struct VkFFTApplication { VkDescriptorPoolSize descriptorPoolSize = { VK_DESCRIPTOR_TYPE_STORAGE_BUFFER }; descriptorPoolSize.descriptorCount = 2; - if ((axis_id == 1) && (configuration.FFTdim == 2) && (configuration.performConvolution)) + if ((axis_id == 1) && (axis_upload_id == 0) && (configuration.FFTdim == 2) && (configuration.performConvolution)) descriptorPoolSize.descriptorCount = 3; - if ((axis_id == 2) && (configuration.FFTdim == 3) && (configuration.performConvolution)) + if ((axis_id == 2) && (axis_upload_id == 0) && (configuration.FFTdim == 3) && (configuration.performConvolution)) descriptorPoolSize.descriptorCount = 3; VkDescriptorPoolCreateInfo descriptorPoolCreateInfo = { VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO }; @@ -1447,7 +2122,9 @@ typedef struct VkFFTApplication { VkDescriptorBufferInfo descriptorBufferInfo = {}; if (i == 0) { - if (configuration.isInputFormatted && ( + descriptorBufferInfo.buffer = configuration.buffer[0]; + 
descriptorBufferInfo.range = configuration.bufferSize[0]; + /*if (configuration.isInputFormatted && ( ((axis_id == 0) && (!inverse)) || ((axis_id == configuration.FFTdim-1) && (inverse))) ) { @@ -1463,11 +2140,13 @@ typedef struct VkFFTApplication { descriptorBufferInfo.buffer = configuration.buffer[0]; descriptorBufferInfo.range = configuration.bufferSize[0]; } - } + }*/ descriptorBufferInfo.offset = 0; } if (i == 1) { - if ((configuration.isOutputFormatted && ( + descriptorBufferInfo.buffer = configuration.buffer[0]; + descriptorBufferInfo.range = configuration.bufferSize[0]; + /*if ((configuration.isOutputFormatted && ( ((axis_id == 0) && (inverse)) || ((axis_id == configuration.FFTdim-1) && (!inverse) && (!configuration.performConvolution)) || ((axis_id == 0) && (configuration.performConvolution) && (configuration.FFTdim == 1))) @@ -1482,7 +2161,7 @@ typedef struct VkFFTApplication { else { descriptorBufferInfo.buffer = configuration.buffer[0]; descriptorBufferInfo.range = configuration.bufferSize[0]; - } + }*/ descriptorBufferInfo.offset = 0; } if (i == 2) { @@ -1516,33 +2195,50 @@ typedef struct VkFFTApplication { vkCreatePipelineLayout(configuration.device[0], &pipelineLayoutCreateInfo, NULL, &axis->pipelineLayout); if (axis_id == 1) { - FFTPlan->supportAxes[0].axisBlock[0] = (configuration.size[axis_id] / 8 > 1) ? configuration.size[axis_id] / 8 : 1; - FFTPlan->supportAxes[0].axisBlock[1] = 1; - FFTPlan->supportAxes[0].axisBlock[2] = 1; - FFTPlan->supportAxes[0].axisBlock[3] = configuration.size[1]; + if (axis_upload_id == 0) { + axis->axisBlock[0] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + if (axis->axisBlock[0] > 512) axis->axisBlock[0] = 512; + axis->axisBlock[1] = 1; + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } + else { + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? 
axis->specializationConstants.fftDim / 8 : 1; + + axis->axisBlock[0] = (axis->specializationConstants.stageStartSize > axis->groupedBatch) ? axis->groupedBatch : axis->specializationConstants.stageStartSize; + + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; + } } if (axis_id == 2) { - FFTPlan->supportAxes[1].axisBlock[0] = (configuration.size[1] > FFTPlan->supportAxes[1].groupedBatch) ? FFTPlan->supportAxes[1].groupedBatch : configuration.size[1]; - FFTPlan->supportAxes[1].axisBlock[1] = (configuration.size[2] / 8 > 1) ? configuration.size[2] / 8 : 1; - FFTPlan->supportAxes[1].axisBlock[2] = 1; - FFTPlan->supportAxes[1].axisBlock[3] = configuration.size[2]; + axis->axisBlock[1] = (axis->specializationConstants.fftDim / 8 > 1) ? axis->specializationConstants.fftDim / 8 : 1; + + axis->axisBlock[0] = (configuration.size[1] > axis->groupedBatch) ? axis->groupedBatch : configuration.size[1]; + /*if (axis->axisBlock[0] * axis->axisBlock[1] < 64) + if (configuration.size[1] > 64 / axis->axisBlock[1]) + axis->axisBlock[0] = 64 / axis->axisBlock[1]; + else + axis->axisBlock[0] = configuration.size[0];*/ + axis->axisBlock[2] = 1; + axis->axisBlock[3] = axis->specializationConstants.fftDim; } - VkSpecializationMapEntry specializationMapEntries[29] = { {} }; - for (uint32_t i = 0; i < 29; i++) { + VkSpecializationMapEntry specializationMapEntries[30] = { {} }; + for (uint32_t i = 0; i < 30; i++) { specializationMapEntries[i].constantID = i + 1; specializationMapEntries[i].size = sizeof(uint32_t); specializationMapEntries[i].offset = i * sizeof(uint32_t); } VkSpecializationInfo specializationInfo = {}; - specializationInfo.dataSize = 29 * sizeof(uint32_t); - specializationInfo.mapEntryCount = 29; + specializationInfo.dataSize = 30 * sizeof(uint32_t); + specializationInfo.mapEntryCount = 30; specializationInfo.pMapEntries = specializationMapEntries; - FFTPlan->supportAxes[axis_id-1].specializationConstants.localSize[0] = 
FFTPlan->supportAxes[axis_id-1].axisBlock[0]; - FFTPlan->supportAxes[axis_id-1].specializationConstants.localSize[1] = FFTPlan->supportAxes[axis_id-1].axisBlock[1]; - FFTPlan->supportAxes[axis_id-1].specializationConstants.localSize[2] = FFTPlan->supportAxes[axis_id-1].axisBlock[2]; - FFTPlan->supportAxes[axis_id-1].specializationConstants.fftDim = FFTPlan->supportAxes[axis_id-1].axisBlock[3]; - specializationInfo.pData = &FFTPlan->supportAxes[axis_id-1].specializationConstants; + axis->specializationConstants.localSize[0] = axis->axisBlock[0]; + axis->specializationConstants.localSize[1] = axis->axisBlock[1]; + axis->specializationConstants.localSize[2] = axis->axisBlock[2]; + axis->specializationConstants.fftDim = axis->axisBlock[3]; + specializationInfo.pData = &axis->specializationConstants; VkPipelineShaderStageCreateInfo pipelineShaderStageCreateInfo = { VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO }; VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; @@ -1552,52 +2248,119 @@ typedef struct VkFFTApplication { if (axis_id == 1) { - if ((configuration.FFTdim == 2) && (configuration.performConvolution)) { - switch (configuration.matrixConvolution) { + if ((configuration.FFTdim == 2) && (configuration.performConvolution) && (axis_upload_id == 0)) { + if (axis_upload_id == 0) { + switch (configuration.matrixConvolution) { + case 1: + VkFFTInitShader(9, &pipelineShaderStageCreateInfo.module); + break; + case 2: + if (configuration.symmetricKernel) + VkFFTInitShader(12, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(15, &pipelineShaderStageCreateInfo.module); + break; + case 3: + if (configuration.symmetricKernel) + VkFFTInitShader(18, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(21, &pipelineShaderStageCreateInfo.module); + break; + } + } + else { + switch (configuration.matrixConvolution) { + case 1: + VkFFTInitShader(10, &pipelineShaderStageCreateInfo.module); + 
break; + case 2: + if (configuration.symmetricKernel) + VkFFTInitShader(13, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(16, &pipelineShaderStageCreateInfo.module); + break; + case 3: + if (configuration.symmetricKernel) + VkFFTInitShader(19, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(22, &pipelineShaderStageCreateInfo.module); + break; + } + } + + } + else { + /*if (axis_upload_id == 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module);*/ + switch (configuration.registerBoost) { case 1: - VkFFTInitShader(9, &pipelineShaderStageCreateInfo.module); + { + if (axis_upload_id == 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module); break; + } case 2: - if (configuration.symmetricKernel) - VkFFTInitShader(12, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(15, &pipelineShaderStageCreateInfo.module); + { + switch (axis->specializationConstants.fftDim) { + case 8192: + VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); + break; + default: + if (axis_upload_id == 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module); + break; + } break; - case 3: - if (configuration.symmetricKernel) - VkFFTInitShader(18, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(21, &pipelineShaderStageCreateInfo.module); + } + case 4: + { + switch (axis->specializationConstants.fftDim){ + case 8192: + VkFFTInitShader(25, &pipelineShaderStageCreateInfo.module); + break; + case 16384: + VkFFTInitShader(35, &pipelineShaderStageCreateInfo.module); + break; + default: + if (axis_upload_id == 0) + VkFFTInitShader(0, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(2, &pipelineShaderStageCreateInfo.module); + break; + } break; } - - } - else { - - VkFFTInitShader(0, 
&pipelineShaderStageCreateInfo.module); + } } } if (axis_id == 2) { - if ((configuration.FFTdim == 3) && (configuration.performConvolution)) { - switch (configuration.matrixConvolution) { - case 1: - VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); - break; - case 2: - if (configuration.symmetricKernel) - VkFFTInitShader(11, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(14, &pipelineShaderStageCreateInfo.module); - break; - case 3: - if (configuration.symmetricKernel) - VkFFTInitShader(17, &pipelineShaderStageCreateInfo.module); - else - VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); - break; - } + if ((configuration.FFTdim == 3) && (configuration.performConvolution) && (axis_upload_id == 0)) { + switch (configuration.matrixConvolution) { + case 1: + VkFFTInitShader(8, &pipelineShaderStageCreateInfo.module); + break; + case 2: + if (configuration.symmetricKernel) + VkFFTInitShader(11, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(14, &pipelineShaderStageCreateInfo.module); + break; + case 3: + if (configuration.symmetricKernel) + VkFFTInitShader(17, &pipelineShaderStageCreateInfo.module); + else + VkFFTInitShader(20, &pipelineShaderStageCreateInfo.module); + break; + } + } else { VkFFTInitShader(7, &pipelineShaderStageCreateInfo.module); @@ -1750,8 +2513,8 @@ typedef struct VkFFTApplication { VkComputePipelineCreateInfo computePipelineCreateInfo = { VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO }; uint32_t max_dim = 1; - if (FFTPlan->axes[axis_id].axisBlock[1] * configuration.size[axis_id] < pow(2, floor(log2(sqrt(1024 * FFTPlan->transpose[axis_id].specializationConstants.ratio))))) - max_dim = FFTPlan->axes[axis_id].axisBlock[1] * configuration.size[axis_id]; + if (FFTPlan->axes[axis_id][0].axisBlock[1] * configuration.size[axis_id] < pow(2, floor(log2(sqrt(1024 * FFTPlan->transpose[axis_id].specializationConstants.ratio))))) + max_dim = FFTPlan->axes[axis_id][0].axisBlock[1] * 
configuration.size[axis_id]; else max_dim = pow(2, floor(log2(sqrt(1024 * FFTPlan->transpose[axis_id].specializationConstants.ratio)))); FFTPlan->transpose[axis_id].transposeBlock[0] = max_dim; @@ -1819,13 +2582,17 @@ typedef struct VkFFTApplication { if (configuration.matrixConvolution > 1) configuration.coordinateFeatures = configuration.matrixConvolution; if (configuration.performConvolution) { + configuration.inverse = false; for (uint32_t i = 0; i < configuration.FFTdim; i++) { - VkFFTPlanAxis(&localFFTPlan_inverse_convolution, i, true); + for (uint32_t j =0; j<8; j++) + VkFFTPlanAxis(&localFFTPlan_inverse_convolution, i, j, true); } + } for (uint32_t i = 0; i < configuration.FFTdim; i++) { - VkFFTPlanAxis(&localFFTPlan, i, configuration.inverse); + for (uint32_t j = 0; j < 8; j++) + VkFFTPlanAxis(&localFFTPlan, i, j, configuration.inverse); } } @@ -1839,50 +2606,85 @@ typedef struct VkFFTApplication { if (!configuration.inverse) { //FFT axis 0 for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[0].pushConstants.batch = j; - uint32_t maxCoordinate = ((configuration.matrixConvolution) > 1 && (configuration.performConvolution) && (configuration.FFTdim == 1)) ? 
1 : configuration.coordinateFeatures; - for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.axes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[0].pipelineLayout, 0, 1, &localFFTPlan.axes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[1]) { - if (configuration.performZeropadding[2]) { + for (int l = localFFTPlan.numAxisUploads[0]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[0][l]; + axis->pushConstants.batch = j; + uint32_t maxCoordinate = ((configuration.matrixConvolution) > 1 && (configuration.performConvolution) && (configuration.FFTdim == 1)) ? 1 : configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / localFFTPlan.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - } - else { - if (configuration.performR2C == 
true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / localFFTPlan.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - } - } - else { - if (configuration.performZeropadding[2]) { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0) , configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0) , ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, configuration.size[1] , ceil(configuration.size[2] / 2.0)); + } + else { + if 
(configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0) , configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, configuration.size[1], configuration.size[2]); + } + } } else { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { + + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2] ); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0) , ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / 
axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1] , ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0) , configuration.size[2] ); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1] , configuration.size[2]); + } + } } } - + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - + if (configuration.FFTdim > 1) { //transpose 0-1, if needed - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.transpose[0].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -1908,11 +2710,11 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - } + }*/ //FFT axis 1 if ((configuration.FFTdim == 2) && (configuration.performConvolution)) { - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { uint32_t maxCoordinate = (configuration.matrixConvolution > 1 ) ? 
1 : configuration.coordinateFeatures; for (uint32_t i = 0; i < maxCoordinate; i++) { localFFTPlan.axes[1].pushConstants.coordinate = i; @@ -1937,50 +2739,75 @@ typedef struct VkFFTApplication { vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } - else { - if (configuration.performR2C == true) { - uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 1 : configuration.coordinateFeatures; + else {*/ + if (configuration.performR2C == true) { + for (int l = localFFTPlan.numSupportAxisUploads[0]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[0][l]; + uint32_t maxCoordinate = ((configuration.matrixConvolution > 1)&&(l == 0)) ? 1 : configuration.coordinateFeatures; for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.supportAxes[0].pushConstants.coordinate = i; - localFFTPlan.supportAxes[0].pushConstants.batch = configuration.numberKernels; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[0].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[2]) { - vkCmdDispatch(commandBuffer, 1, 1, ceil(configuration.size[2] / 2.0)); + axis->pushConstants.coordinate = i; + + axis->pushConstants.batch = ((l == 0)&& (configuration.matrixConvolution == 1)) ? 
configuration.numberKernels : 0; + + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + } } - else { - vkCmdDispatch(commandBuffer, 1, 1, configuration.size[2]); + else{ + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, configuration.size[2]); + } } } + if (l >0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } - uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 1 : configuration.coordinateFeatures; + + } + + for (int l = localFFTPlan.numAxisUploads[1]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[1][l]; + uint32_t maxCoordinate = ((configuration.matrixConvolution > 1) && (l == 0)) ? 
1 : configuration.coordinateFeatures; for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.axes[1].pushConstants.coordinate = i; - localFFTPlan.axes[1].pushConstants.batch = configuration.numberKernels; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipelineLayout, 0, 1, &localFFTPlan.axes[1].descriptorSet, 0, NULL); + + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = ((l == 0) && (configuration.matrixConvolution == 1)) ? configuration.numberKernels : 0; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); if (configuration.performZeropadding[2]) { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0]* configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0]* configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); } else { if (configuration.performR2C == 
true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0]* configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2] ); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0]* configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2] ); } } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } + + //} } else { - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.axes[1].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2008,55 +2835,75 @@ typedef struct VkFFTApplication { } else { - - if (configuration.performR2C == true) { - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.supportAxes[0].pushConstants.batch = j; + */ + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numSupportAxisUploads[0]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[0][l]; + axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.supportAxes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, 
localFFTPlan.supportAxes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[0].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[2]) { - vkCmdDispatch(commandBuffer, 1, 1, ceil(configuration.size[2] / 2.0)); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil (configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + } } else { - vkCmdDispatch(commandBuffer, 1, 1, configuration.size[2]); + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, configuration.size[2]); + } } } + if (l >=0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[1].pushConstants.batch = j; + } + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numAxisUploads[1]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[1][l]; + axis->pushConstants.batch = j; for 
(uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.axes[1].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipelineLayout, 0, 1, &localFFTPlan.axes[1].descriptorSet, 0, NULL); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); if (configuration.performZeropadding[2]) { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); } else { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, 
ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); } } - } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } + + //} } } //FFT axis 2 if (configuration.FFTdim > 2) { //transpose 1-2, after 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.transpose[1].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2082,11 +2929,11 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - } + }*/ if ((configuration.FFTdim == 3) && (configuration.performConvolution)) { //transposed 1-2, transposed 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 1 : configuration.coordinateFeatures; for (uint32_t i = 0; i < maxCoordinate; i++) { localFFTPlan.axes[2].pushConstants.coordinate = i; @@ -2103,55 +2950,71 @@ typedef struct VkFFTApplication { } else { - if (configuration.performTranspose[0]) { - //transposed 0-1, didn't transpose 1-2 - uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 
1 : configuration.coordinateFeatures; - for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.axes[2].pushConstants.coordinate = i; - localFFTPlan.axes[2].pushConstants.batch = configuration.numberKernels; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[2].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[2].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0] / 2 + 1); - else - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0]); - } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + if (configuration.performTranspose[0]) { + //transposed 0-1, didn't transpose 1-2 + uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 
1 : configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + localFFTPlan.axes[2].pushConstants.coordinate = i; + localFFTPlan.axes[2].pushConstants.batch = configuration.numberKernels; + vkCmdPushConstants(commandBuffer, localFFTPlan.axes[2].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[2].pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0] / 2 + 1); + else + vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0]); } - else { - //didn't transpose 0-1, didn't transpose 1-2 - if (configuration.performR2C == true) { - uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 
1 : configuration.coordinateFeatures; - for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.supportAxes[1].pushConstants.coordinate = i; - localFFTPlan.supportAxes[1].pushConstants.batch = configuration.numberKernels; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[1].descriptorSet, 0, NULL); - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.supportAxes[1].axisBlock[0], 1, 1); + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } + else {*/ + //didn't transpose 0-1, didn't transpose 1-2 + if (configuration.performR2C == true) { - } - } - uint32_t maxCoordinate = (configuration.matrixConvolution > 1) ? 1 : configuration.coordinateFeatures; + for (int l = localFFTPlan.numSupportAxisUploads[1]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[1][l]; + uint32_t maxCoordinate = ((configuration.matrixConvolution > 1) && (l == 0)) ? 
1 : configuration.coordinateFeatures; for (uint32_t i = 0; i < maxCoordinate; i++) { - localFFTPlan.axes[2].pushConstants.coordinate = i; - localFFTPlan.axes[2].pushConstants.batch = configuration.numberKernels; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[2].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[2].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); - else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); + axis->pushConstants.coordinate = i; + + axis->pushConstants.batch = ((l == 0) && (configuration.matrixConvolution == 1)) ? 
configuration.numberKernels : 0; + + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->axisBlock[0]* configuration.size[2] / axis->specializationConstants.fftDim, 1, 1); + } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + if (l >=0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } + + for (int l= localFFTPlan.numAxisUploads[2]-1; l >=0; l--) { + + VkFFTAxis* axis = &localFFTPlan.axes[2][l]; + uint32_t maxCoordinate = ((configuration.matrixConvolution > 1) && (l == 0)) ? 1 : configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = ((l == 0) && (configuration.matrixConvolution == 1)) ? 
configuration.numberKernels : 0; + + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + //} + //} } else { //transposed 1-2, transposed 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.axes[2].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2168,39 +3031,9 @@ typedef struct VkFFTApplication { vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } - else { - if (configuration.performTranspose[0]) { - //transposed 0-1, didn't transpose 1-2 - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[2].pushConstants.batch = j; - for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.axes[2].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[2].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[2].pushConstants); - 
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0] / 2 + 1); - else - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0]); - } - } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - - } - else { - //didn't transpose 0-1, didn't transpose 1-2 - if (configuration.performR2C == true) { - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.supportAxes[1].pushConstants.batch = j; - for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.supportAxes[1].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[1].descriptorSet, 0, NULL); - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.supportAxes[1].axisBlock[0], 1, 1); - } - } - } + else { + if (configuration.performTranspose[0]) { + //transposed 0-1, didn't transpose 1-2 for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.axes[2].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2209,23 +3042,66 @@ typedef struct VkFFTApplication { 
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); + vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0] / 2 + 1); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); + vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[0]); + } + } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + else {*/ + //didn't transpose 0-1, didn't transpose 1-2 + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numSupportAxisUploads[1]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[1][l]; + axis->pushConstants.batch = j; + for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { + axis->pushConstants.coordinate = i; + + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, 1); + } + if (l >= 0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + } + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numAxisUploads[2]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[2][l]; + axis->pushConstants.batch = j; + for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } } + + //} + //} } } } if (configuration.performConvolution) { if (configuration.FFTdim > 2) { + //transpose 1-2, after 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { for (uint32_t j = 0; j < configuration.numberKernels; j++) { localFFTPlan_inverse_convolution.transpose[1].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2278,51 +3154,112 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } - else { - + else {*/ + //multiple upload ifft leftovers 
+ if (configuration.FFTdim == 3) { if (configuration.performR2C == true) { for (uint32_t j = 0; j < configuration.numberKernels; j++) { - localFFTPlan_inverse_convolution.supportAxes[0].pushConstants.batch = j; + for (int l = 1; l< localFFTPlan_inverse_convolution.numSupportAxisUploads[1]; l++) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.supportAxes[1][l]; + uint32_t maxCoordinate = configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = j; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, 1); + + } + if (l > 0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + } + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = 1; l < localFFTPlan_inverse_convolution.numAxisUploads[2]; l++) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.axes[2][l]; + uint32_t maxCoordinate = configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = j; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, 
NULL); + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + } + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = localFFTPlan_inverse_convolution.numSupportAxisUploads[0]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.supportAxes[0][l]; + axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan_inverse_convolution.supportAxes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan_inverse_convolution.supportAxes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan_inverse_convolution.supportAxes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan_inverse_convolution.supportAxes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan_inverse_convolution.supportAxes[0].pipelineLayout, 0, 1, &localFFTPlan_inverse_convolution.supportAxes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[2]) { - vkCmdDispatch(commandBuffer, 1, 1, ceil(configuration.size[2] / 2.0)); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + 
vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + } } else { - vkCmdDispatch(commandBuffer, 1, 1, configuration.size[2]); + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, configuration.size[2]); + } } } + if (l >= 0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } - for (uint32_t j = 0; j < configuration.numberKernels; j++) { - localFFTPlan_inverse_convolution.axes[1].pushConstants.batch = j; + } + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = localFFTPlan_inverse_convolution.numAxisUploads[1]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.axes[1][l]; + axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan_inverse_convolution.axes[1].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan_inverse_convolution.axes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan_inverse_convolution.axes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan_inverse_convolution.axes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, 
localFFTPlan_inverse_convolution.axes[1].pipelineLayout, 0, 1, &localFFTPlan_inverse_convolution.axes[1].descriptorSet, 0, NULL); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); if (configuration.performZeropadding[2]) { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan_inverse_convolution.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan_inverse_convolution.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan_inverse_convolution.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan_inverse_convolution.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); } else { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan_inverse_convolution.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan_inverse_convolution.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2] ); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan_inverse_convolution.axes[1].axisBlock[0], 1, configuration.size[2] / 
localFFTPlan_inverse_convolution.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2] ); } } - } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } + + //} @@ -2330,7 +3267,7 @@ typedef struct VkFFTApplication { } if (configuration.FFTdim > 1) { // transpose 0 - 1, if needed - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { for (uint32_t j = 0; j < configuration.numberKernels; j++) { localFFTPlan_inverse_convolution.transpose[0].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2356,50 +3293,183 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + }*/ + if (configuration.FFTdim == 2) { + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = 1; l< localFFTPlan_inverse_convolution.numSupportAxisUploads[0]; l++) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.supportAxes[0][l]; + uint32_t maxCoordinate = configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = j; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, 
&axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, configuration.size[2]); + } + } + } + if (l > 0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + + } + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = 1; l< localFFTPlan_inverse_convolution.numAxisUploads[1]; l++) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.axes[1][l]; + uint32_t maxCoordinate = configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = j; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, 
configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + + } + } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } } for (uint32_t j = 0; j < configuration.numberKernels; j++) { - localFFTPlan_inverse_convolution.axes[0].pushConstants.batch = j; - for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan_inverse_convolution.axes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan_inverse_convolution.axes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan_inverse_convolution.axes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan_inverse_convolution.axes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan_inverse_convolution.axes[0].pipelineLayout, 0, 1, &localFFTPlan_inverse_convolution.axes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[1]) { - if (configuration.performZeropadding[2]) { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[2])); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 
localFFTPlan_inverse_convolution.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[2])); + for (int l = localFFTPlan_inverse_convolution.numAxisUploads[0]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.axes[0][l]; + axis->pushConstants.batch = j; + for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { + + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / 
axis->specializationConstants.fftDim, configuration.size[1], ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, configuration.size[1], configuration.size[2]); + } + } } else { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan_inverse_convolution.axes[0].axisBlock[2]); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan_inverse_convolution.axes[0].axisBlock[2]); + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, 
configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1], ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1], configuration.size[2]); + } + } } + } - else { + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + + + } + if (configuration.FFTdim == 1) { + for (uint32_t j = 0; j < configuration.numberKernels; j++) { + for (int l = 1; l < localFFTPlan_inverse_convolution.numAxisUploads[0]; l++) { + VkFFTAxis* axis = &localFFTPlan_inverse_convolution.axes[0][l]; + uint32_t maxCoordinate = configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + + axis->pushConstants.coordinate = i; + axis->pushConstants.batch = j; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); if (configuration.performZeropadding[2]) { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan_inverse_convolution.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / 
localFFTPlan_inverse_convolution.axes[0].axisBlock[2])); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan_inverse_convolution.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / localFFTPlan_inverse_convolution.axes[0].axisBlock[2])); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); } else { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan_inverse_convolution.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan_inverse_convolution.axes[0].axisBlock[2]); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan_inverse_convolution.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan_inverse_convolution.axes[0].axisBlock[2]); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); } } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - - } } @@ -2408,7 +3478,7 @@ typedef struct VkFFTApplication { //FFT axis 2 if (configuration.FFTdim > 2) { //transposed 1-2, transposed 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { for (uint32_t j = 0; j < 
configuration.numberBatches; j++) { localFFTPlan.axes[2].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2444,39 +3514,52 @@ typedef struct VkFFTApplication { vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } - else { + else {*/ //didn't transpose 0-1, didn't transpose 1-2 - if (configuration.performR2C == true) { - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.supportAxes[1].pushConstants.batch = j; - for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.supportAxes[1].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[1].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[1].descriptorSet, 0, NULL); - vkCmdDispatch(commandBuffer, configuration.size[1] / localFFTPlan.supportAxes[1].axisBlock[0], 1, 1); - } - } - } - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[2].pushConstants.batch = j; + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numSupportAxisUploads[1]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[1][l]; + axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.axes[2].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[2].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[2].pushConstants); - 
vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[2].pipelineLayout, 0, 1, &localFFTPlan.axes[2].descriptorSet, 0, NULL); - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); - else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[2].axisBlock[0], 1, configuration.size[1]); + axis->pushConstants.coordinate = i; + + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, 1); + } + if (l >0) + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + + } + } + } + + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numAxisUploads[2]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[2][l]; + axis->pushConstants.batch = j; + for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (configuration.performR2C == true) + 
vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[2] / axis->specializationConstants.fftDim, 1, configuration.size[1]); } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } } + + //} + //} //transpose 1-2, after 0-1 - if (configuration.performTranspose[1]) { + /*if (configuration.performTranspose[1]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.transpose[1].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2502,13 +3585,13 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - } + }*/ } if (configuration.FFTdim > 1) { //FFT axis 1 - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.axes[1].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2526,54 +3609,72 @@ typedef struct VkFFTApplication { vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } - else { - - if (configuration.performR2C == true) { - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.supportAxes[0].pushConstants.batch = j; + else {*/ + + if (configuration.performR2C == true) { + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numSupportAxisUploads[0]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan.supportAxes[0][l]; + 
axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - - localFFTPlan.supportAxes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.supportAxes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.supportAxes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.supportAxes[0].pipelineLayout, 0, 1, &localFFTPlan.supportAxes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[2]) { - vkCmdDispatch(commandBuffer, 1, 1, ceil(configuration.size[2] / 2.0)); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); + } } else { - vkCmdDispatch(commandBuffer, 1, 1, configuration.size[2]); + if (configuration.performZeropadding[2]) { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, ceil(configuration.size[2] / 2.0)); + } + else { + vkCmdDispatch(commandBuffer, configuration.size[1] / axis->specializationConstants.fftDim / axis->axisBlock[0], 1, configuration.size[2]); + } } } + if (l >= 0) + vkCmdPipelineBarrier(commandBuffer, 
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } - for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[1].pushConstants.batch = j; + } + for (uint32_t j = 0; j < configuration.numberBatches; j++) { + for (int l = localFFTPlan.numAxisUploads[1]-1; l >= 0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[1][l]; + axis->pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - localFFTPlan.axes[1].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[1].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[1].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[1].pipelineLayout, 0, 1, &localFFTPlan.axes[1].descriptorSet, 0, NULL); + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); if (configuration.performZeropadding[2]) { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, ceil(configuration.size[2] / 2.0 / 
localFFTPlan.axes[1].axisBlock[2])); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, ceil(configuration.size[2] / 2.0)); } else { if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, configuration.size[0] / 2 / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, ceil(configuration.size[0] / 2.0) / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); else - vkCmdDispatch(commandBuffer, configuration.size[0] / localFFTPlan.axes[1].axisBlock[0], 1, configuration.size[2] / localFFTPlan.axes[1].axisBlock[2]); + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->axisBlock[0] * configuration.size[1] / axis->specializationConstants.fftDim, 1, configuration.size[2]); } } - } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); + } } + //} // transpose 0 - 1, if needed - if (configuration.performTranspose[0]) { + /*if (configuration.performTranspose[0]) { for (uint32_t j = 0; j < configuration.numberBatches; j++) { localFFTPlan.transpose[0].pushConstants.batch = j; for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { @@ -2599,74 +3700,118 @@ typedef struct VkFFTApplication { } vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - } + }*/ } //FFT axis 0 for (uint32_t j = 0; j < configuration.numberBatches; j++) { - localFFTPlan.axes[0].pushConstants.batch = j; - for (uint32_t i = 0; i < configuration.coordinateFeatures; i++) { - 
localFFTPlan.axes[0].pushConstants.coordinate = i; - vkCmdPushConstants(commandBuffer, localFFTPlan.axes[0].pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &localFFTPlan.axes[0].pushConstants); - vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[0].pipeline); - vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, localFFTPlan.axes[0].pipelineLayout, 0, 1, &localFFTPlan.axes[0].descriptorSet, 0, NULL); - if (configuration.performZeropadding[1]) { - if (configuration.performZeropadding[2]) { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / localFFTPlan.axes[0].axisBlock[1]), ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - } - else { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / 2.0 / localFFTPlan.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - else - vkCmdDispatch(commandBuffer, 1, ceil(configuration.size[1] / 2.0 / localFFTPlan.axes[0].axisBlock[1]), configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - } - } - else { - if (configuration.performZeropadding[2]) { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); - else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan.axes[0].axisBlock[1], ceil(configuration.size[2] / 2.0 / localFFTPlan.axes[0].axisBlock[2])); + for (int l = localFFTPlan.numAxisUploads[0]-1; l >=0; l--) { + VkFFTAxis* axis = &localFFTPlan.axes[0][l]; + axis->pushConstants.batch = j; + uint32_t maxCoordinate = 
((configuration.matrixConvolution) > 1 && (configuration.performConvolution) && (configuration.FFTdim == 1)) ? 1 : configuration.coordinateFeatures; + for (uint32_t i = 0; i < maxCoordinate; i++) { + axis->pushConstants.coordinate = i; + vkCmdPushConstants(commandBuffer, axis->pipelineLayout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(VkFFTPushConstantsLayout), &axis->pushConstants); + vkCmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipeline); + vkCmdBindDescriptorSets(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE, axis->pipelineLayout, 0, 1, &axis->descriptorSet, 0, NULL); + if (l == 0) { + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { + + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, configuration.size[1], ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / 
axis->specializationConstants.fftDim, ceil(configuration.size[1] / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim, configuration.size[1], configuration.size[2]); + } + } } else { - if (configuration.performR2C == true) - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / 2 / localFFTPlan.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); - else - vkCmdDispatch(commandBuffer, 1, configuration.size[1] / localFFTPlan.axes[0].axisBlock[1], configuration.size[2] / localFFTPlan.axes[0].axisBlock[2]); + if (configuration.performZeropadding[1]) { + if (configuration.performZeropadding[2]) { + + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0 / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), configuration.size[2]); + } + } + else { + if (configuration.performZeropadding[2]) { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), ceil(configuration.size[2] / 2.0)); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1], 
ceil(configuration.size[2] / 2.0)); + } + else { + if (configuration.performR2C == true) + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], ceil(configuration.size[1] / 2.0), configuration.size[2]); + else + vkCmdDispatch(commandBuffer, configuration.size[0] / axis->specializationConstants.fftDim / axis->axisBlock[0], configuration.size[1], configuration.size[2]); + } + } } } + vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); } } - vkCmdPipelineBarrier(commandBuffer, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, 1, &memory_barrier, 0, NULL, 0, NULL); - + } } /* deleteVulkanFFT: tears down the FFT plans. For each of the configuration.FFTdim dimensions it calls deleteAxis on the axes of localFFTPlan; for each pair of consecutive dimensions it calls deleteTranspose when configuration.performTranspose[i] is set and deleteAxis on the support axes otherwise. When configuration.performConvolution is set, the same teardown is repeated for localFFTPlan_inverse_convolution. In this patch the removed lines delete one axis object per dimension, while the added lines iterate over numAxisUploads[i] / numSupportAxisUploads[i] entries per dimension. */ void deleteVulkanFFT() { for (uint32_t i = 0; i < configuration.FFTdim; i++) { - deleteAxis(&localFFTPlan.axes[i]); + for (uint32_t j = 0; j < localFFTPlan.numAxisUploads[i]; j++) + deleteAxis(&localFFTPlan.axes[i][j]); } - for (uint32_t i = 0; i < 2; i++) { + for (uint32_t i = 0; i < configuration.FFTdim-1; i++) { /* The bound changes from the literal 2 to FFTdim-1, matching the loop used for the inverse-convolution plan further down. NOTE(review): if FFTdim is an unsigned type, FFTdim == 0 would make FFTdim-1 wrap to a huge value; confirm that FFTdim >= 1 is guaranteed before this runs. */ if (configuration.performTranspose[i]) deleteTranspose(&localFFTPlan.transpose[i]); - else - deleteAxis(&localFFTPlan.supportAxes[i]); + else { + for (uint32_t j = 0; j < localFFTPlan.numSupportAxisUploads[i]; j++) + deleteAxis(&localFFTPlan.supportAxes[i][j]); + } } if (configuration.performConvolution) { /* Same teardown sequence, applied to the second plan localFFTPlan_inverse_convolution. */ for (uint32_t i = 0; i < configuration.FFTdim; i++) { - deleteAxis(&localFFTPlan_inverse_convolution.axes[i]); + for (uint32_t j = 0; j < localFFTPlan_inverse_convolution.numAxisUploads[i]; j++) + deleteAxis(&localFFTPlan_inverse_convolution.axes[i][j]); } for (uint32_t i = 0; i < configuration.FFTdim - 1; i++) { if (configuration.performTranspose[i]) deleteTranspose(&localFFTPlan_inverse_convolution.transpose[i]); - else - deleteAxis(&localFFTPlan_inverse_convolution.supportAxes[i]); + else { + for (uint32_t j = 0; j < localFFTPlan_inverse_convolution.numSupportAxisUploads[i]; j++) + 
deleteAxis(&localFFTPlan_inverse_convolution.supportAxes[i][j]); + } } } }