go BLAS #17

zmonoid opened this issue Oct 24, 2018 · 2 comments


zmonoid commented Oct 24, 2018


I am trying to convert OpenBLAS to Go with this project.

A simple C code:

// degmm.c
#include <cblas.h>

void degmm()
  int i=0;
  double A[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
  double B[6] = {1.0,2.0,1.0,-3.0,4.0,-1.0};
  double C[9] = {.5,.5,.5,.5,.5,.5,.5,.5,.5};
  cblas_dgemm(CblasColMajor, CblasNoTrans, CblasTrans,3,3,2,1,A, 3, B, 3,2,C,3);

int main () {

I compile it with the following command:

clang-6.0 -O3 -mavx -mfma -masm=intel -fno-asynchronous-unwind-tables -fno-exceptions -fno-rtti -S degmm.c

This produces the following assembly code, degmm.s:

	.intel_syntax noprefix
	.file	"degmm.c"
	.section	.rodata.cst32,"aM",@progbits,32
	.p2align	5               # -- Begin function degmm
.LCPI0_0:                               # 32-byte constant: first four elements of A and of B
	.quad	4607182418800017408     # double 1
	.quad	4611686018427387904     # double 2
	.quad	4607182418800017408     # double 1
	.quad	-4609434218613702656    # double -3
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3
.LCPI0_1:                               # alpha argument of cblas_dgemm
	.quad	4607182418800017408     # double 1
.LCPI0_2:                               # beta argument of cblas_dgemm
	.quad	4611686018427387904     # double 2
	.text                           # code must not be emitted into .rodata.cst8
	.globl	degmm
	.p2align	4, 0x90
	.type	degmm,@function
#-----------------------------------------------------------------------
# void degmm(void)
# ABI:   System V AMD64, Intel syntax
# Does:  builds A, B (6 doubles each) and C (9 doubles) on the stack and
#        calls cblas_dgemm(102, 111, 112, 3, 3, 2, 1.0, A, 3, B, 3, 2.0, C, 3)
# Frame: 168 bytes of locals: [rsp+0] = B, [rsp+48] = A, [rsp+96] = C
#        plus 48 bytes of pushed stack arguments around the call
# Clobb: rax, rcx, rdx, rsi, rdi, r8-r11, ymm0, xmm1, and whatever
#        cblas_dgemm clobbers
#-----------------------------------------------------------------------
degmm:                                  # @degmm
# %bb.0:
	sub	rsp, 168                # entry rsp % 16 == 8, so rsp is 16-byte aligned after this
	vmovaps	ymm0, ymmword ptr [rip + .LCPI0_0] # ymm0 = [1.000000e+00,2.000000e+00,1.000000e+00,-3.000000e+00]
	vmovups	ymmword ptr [rsp + 48], ymm0    # A[0..3]
	movabs	rax, 4616189618054758400        # bit pattern of double 4.0
	mov	qword ptr [rsp + 80], rax       # A[4]
	movabs	rcx, -4616189618054758400       # bit pattern of double -1.0
	mov	qword ptr [rsp + 88], rcx       # A[5]
	vmovups	ymmword ptr [rsp], ymm0         # B[0..3]
	mov	qword ptr [rsp + 32], rax       # B[4]
	mov	qword ptr [rsp + 40], rcx       # B[5]
	mov	rax, qword ptr [rip + .Ldegmm.C+64]
	mov	qword ptr [rsp + 160], rax      # C[8]
	vmovups	ymm0, ymmword ptr [rip + .Ldegmm.C+32]
	vmovups	ymmword ptr [rsp + 128], ymm0   # C[4..7]
	vmovups	ymm0, ymmword ptr [rip + .Ldegmm.C]
	vmovups	ymmword ptr [rsp + 96], ymm0    # C[0..3]
	lea	rax, [rsp + 96]                 # rax = C (taken before the pushes move rsp)
	mov	r10, rsp                        # r10 = B
	lea	r11, [rsp + 48]                 # r11 = A
	vmovsd	xmm0, qword ptr [rip + .LCPI0_1] # xmm0 = mem[0],zero
	vmovsd	xmm1, qword ptr [rip + .LCPI0_2] # xmm1 = mem[0],zero
	mov	edi, 102                        # CblasColMajor
	mov	esi, 111                        # CblasNoTrans
	mov	edx, 112                        # CblasTrans
	mov	ecx, 3                          # M
	mov	r8d, 3                          # N
	mov	r9d, 2                          # K
	push	3                               # ldc   (stack args are pushed right-to-left)
	push	rax                             # C
	push	3                               # ldb
	push	r10                             # B
	push	3                               # lda
	push	r11                             # A
	vzeroupper                              # clear dirty upper YMM state before calling external code
	call	cblas_dgemm
	add	rsp, 216                        # 48 bytes of pushed args + 168 bytes of locals
	ret
.Lfunc_end0:
	.size	degmm, .Lfunc_end0-degmm
                                        # -- End function
	.section	.rodata.cst32,"aM",@progbits,32
	.p2align	5               # -- Begin function main
.LCPI1_0:                               # 32-byte constant: first four elements of A and of B
	.quad	4607182418800017408     # double 1
	.quad	4611686018427387904     # double 2
	.quad	4607182418800017408     # double 1
	.quad	-4609434218613702656    # double -3
	.section	.rodata.cst8,"aM",@progbits,8
	.p2align	3
.LCPI1_1:                               # alpha argument of cblas_dgemm
	.quad	4607182418800017408     # double 1
.LCPI1_2:                               # beta argument of cblas_dgemm
	.quad	4611686018427387904     # double 2
	.text                           # code must not be emitted into .rodata.cst8
	.globl	main
	.p2align	4, 0x90
	.type	main,@function
#-----------------------------------------------------------------------
# int main(void)
# ABI:   System V AMD64, Intel syntax
# Does:  same instruction sequence as degmm (builds A, B, C on the stack
#        and calls cblas_dgemm), then returns 0 in eax
# Frame: 168 bytes of locals: [rsp+0] = B, [rsp+48] = A, [rsp+96] = C
#        plus 48 bytes of pushed stack arguments around the call
#-----------------------------------------------------------------------
main:                                   # @main
# %bb.0:
	sub	rsp, 168                # entry rsp % 16 == 8, so rsp is 16-byte aligned after this
	vmovaps	ymm0, ymmword ptr [rip + .LCPI1_0] # ymm0 = [1.000000e+00,2.000000e+00,1.000000e+00,-3.000000e+00]
	vmovups	ymmword ptr [rsp + 48], ymm0    # A[0..3]
	movabs	rax, 4616189618054758400        # bit pattern of double 4.0
	mov	qword ptr [rsp + 80], rax       # A[4]
	movabs	rcx, -4616189618054758400       # bit pattern of double -1.0
	mov	qword ptr [rsp + 88], rcx       # A[5]
	vmovups	ymmword ptr [rsp], ymm0         # B[0..3]
	mov	qword ptr [rsp + 32], rax       # B[4]
	mov	qword ptr [rsp + 40], rcx       # B[5]
	mov	rax, qword ptr [rip + .Ldegmm.C+64]
	mov	qword ptr [rsp + 160], rax      # C[8]
	vmovups	ymm0, ymmword ptr [rip + .Ldegmm.C+32]
	vmovups	ymmword ptr [rsp + 128], ymm0   # C[4..7]
	vmovups	ymm0, ymmword ptr [rip + .Ldegmm.C]
	vmovups	ymmword ptr [rsp + 96], ymm0    # C[0..3]
	lea	rax, [rsp + 96]                 # rax = C (taken before the pushes move rsp)
	mov	r10, rsp                        # r10 = B
	lea	r11, [rsp + 48]                 # r11 = A
	vmovsd	xmm0, qword ptr [rip + .LCPI1_1] # xmm0 = mem[0],zero
	vmovsd	xmm1, qword ptr [rip + .LCPI1_2] # xmm1 = mem[0],zero
	mov	edi, 102                        # CblasColMajor
	mov	esi, 111                        # CblasNoTrans
	mov	edx, 112                        # CblasTrans
	mov	ecx, 3                          # M
	mov	r8d, 3                          # N
	mov	r9d, 2                          # K
	push	3                               # ldc   (stack args are pushed right-to-left)
	push	rax                             # C
	push	3                               # ldb
	push	r10                             # B
	push	3                               # lda
	push	r11                             # A
	vzeroupper                              # clear dirty upper YMM state before calling external code
	call	cblas_dgemm
	add	rsp, 48                         # drop the six pushed stack arguments
	xor	eax, eax                        # return 0
	add	rsp, 168                        # release locals
	ret
.Lfunc_end1:
	.size	main, .Lfunc_end1-main
                                        # -- End function
	.type	.Ldegmm.C,@object       # @degmm.C
	.section	.rodata,"a",@progbits
	.p2align	4
.Ldegmm.C:                              # initial contents of C: nine doubles (72 bytes), all 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.quad	4602678819172646912     # double 0.5
	.size	.Ldegmm.C, 72

	.ident	"clang version 6.0.0-1ubuntu2 (tags/RELEASE_600/final)"
	.section	".note.GNU-stack","",@progbits

Then I convert it to Go assembly:

c2goasm -a degmm.s Degmm.s

This reports the following error:

Processing cpp/degmm.s
panic: 'sub rsp' found but in unexpected scenario:      sub     rsp, 168

goroutine 1 [running]:
main.(*Epilogue).isPrologueInstruction(0xc0000575b8, 0xc0000171a0, 0xd, 0xd)
        /home/bzhou/go/src/ +0x3d4
main.getPrologueLines(0xc0000d0150, 0x26, 0xeb, 0xc0000575b8, 0x0)
        /home/bzhou/go/src/ +0xb4
main.(*Subroutine).removePrologueLines(0xc000057590, 0xc0000d0000, 0x8d, 0x100, 0x15, 0x3b)
        /home/bzhou/go/src/ +0x87
main.extractSubroutine(0x3a, 0xc0000d0000, 0x8d, 0x100, 0x11, 0xc000017188, 0x5, 0x14, 0x0, 0x0, ...)
        /home/bzhou/go/src/ +0x2cd
main.segmentSource(0xc0000d0000, 0x8d, 0x100, 0x20, 0xf, 0xc)
        /home/bzhou/go/src/ +0x20e
main.process(0xc0000d0000, 0x8d, 0x100, 0xc000017058, 0x8, 0x0, 0x0, 0x0, 0x0, 0x0)
        /home/bzhou/go/src/ +0x80
        /home/bzhou/go/src/ +0x37e

Any suggestion?

Maybe experiment with -mno-red-zone and/or -mstackrealign options?

zmonoid commented Oct 25, 2018

@fwessels Still the same error. I guess it may be a compiler problem. The assembly I generated is different from the one given in the example. May I check your compiler version?

