Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

xe: jit: enable tracing ir -> ngen #2841

Open
wants to merge 9 commits into
base: main
Choose a base branch
from
Open

Conversation

rjoursler
Copy link
Contributor

@rjoursler rjoursler commented Mar 7, 2025

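Adds a new tracing step that records the ngen assembly emitted by the IR lowering. The trace is split into prologue, IR, and epilogue sections; within the IR section, each IR statement is printed as a `//` comment, followed by any instructions emitted for it. It produces output like: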

[TRACE]  // Prologue
[TRACE]  (W)    and     (1|M0)  r127.2<1>:ud    r0.0<0;1,0>:ud  0xffffffe0:ud
[TRACE]  (W)    and     (1|M0)  r127.0<1>:uw    r0.4<0;1,0>:uw  0xff:uw
[TRACE]  (W)    add     (1|M0)  r127.2<1>:ud    r127.2<0;1,0>:ud        0x40:uw                         {@2}
[TRACE]  (W)    mad     (1|M0)  r127.0<1>:ud    r127.2<0;0>:ud  r127.0<0;0>:uw  0xc0:uw                 {@1}
[TRACE]  (W)    send.ugm        (1|M0)  r1      r127    null:0  0xff000000      0x6228e500              {$0,I@1}
[TRACE]  (W)    add     (1|M0)  r127.0<1>:ud    r127.0<0;1,0>:ud        0x80:uw                         {$0.dst}
[TRACE]  (W)    send.ugm        (1|M0)  r3      r127    null:0  0xff000000      0x6218d500              {$31,I@1}
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  L0:
[TRACE]  (W)    and     (1|M0)  r127.0<1>:ud    r0.0<0;1,0>:ud  0xffffffe0:ud
[TRACE]  (W)    send.ugm        (1|M0)  r4      r127    null:0  0xff000000      0x6218c700              {$1,I@1}
[TRACE]  L1:
[TRACE]  (W)    or      (1|M0)  cr0.0<1>:ud     cr0.0<0;1,0>:ud 0x14c0:uw                               {A@1}
[TRACE]  // IR
[TRACE]  // alloc dst[0] -> r4.7:uq
[TRACE]  // alloc bia[0] -> r4.6:uq
[TRACE]  // alloc wei[0] -> r4.5:uq
[TRACE]  // alloc src[0] -> r4.4:uq
[TRACE]  // alloc c[512] -> r30 - r31; r32 - r33; r34 - r35; r36 - r37
[TRACE]  // alloc b[1024] -> r14 - r29
[TRACE]  // alloc a[512] -> r5 - r12
[TRACE]  // zero_out(c, 512)
[TRACE]  (W)    mov     (32|M0) r30.0<1>:f      0x0:w                                   {A@1}
[TRACE]  (W)    mov     (32|M0) r32.0<1>:f      0x0:w
[TRACE]  (W)    mov     (32|M0) r34.0<1>:f      0x0:w
[TRACE]  (W)    mov     (32|M0) r36.0<1>:f      0x0:w
[TRACE]  // alloc h_0[64] -> r13
[TRACE]  // h_0.u64(0) = u64(src)
[TRACE]  (W)    mov     (1|M0)  r13.0<1>:uq     r4.4<0;1,0>:uq                                  {$1.dst}
[TRACE]  // load.owordx4(src, h_0, a[0], true, (nil), (nil), (nil))
[TRACE]  (W)    send.ugm        (1|M0)  r5      r13     null:0  0x0     0x218c780               {$1,I@1}
[TRACE]  // alloc h_1[64] -> r13
[TRACE]  // h_1.u64(0) = (u64(src) + 64)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.4<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r13.0<1>:uq     r4.0<0;1,0>:uq  0x40:w                          {@1}
[TRACE]  // load.owordx16(src, h_1, a[64], false, (nil), (nil), (nil))
[TRACE]  (W)    mov     (32|M0) r6.0<1>:f       0x0:w
[TRACE]  (W)    mov     (32|M0) r8.0<1>:f       0x0:w
[TRACE]  // alloc h_2[64] -> r13
[TRACE]  // h_2.u64(0) = (u64(src) + 320)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.4<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r13.0<1>:uq     r4.0<0;1,0>:uq  0x140:w                         {@1}
[TRACE]  // load.owordx8(src, h_2, a[320], false, (nil), (nil), (nil))
[TRACE]  (W)    mov     (32|M0) r10.0<1>:f      0x0:w
[TRACE]  // alloc h_3[64] -> r13
[TRACE]  // h_3.u64(0) = (u64(src) + 448)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.4<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r13.0<1>:uq     r4.0<0;1,0>:uq  0x1c0:w                         {@1}
[TRACE]  // load.owordx4(src, h_3, a[448], false, (nil), (nil), (nil))
[TRACE]  (W)    mov     (16|M0) r12.0<1>:f      0x0:w
[TRACE]  // alloc h_4[64] -> r13
[TRACE]  // h_4.s32(2) = 63
[TRACE]  (W)    mov     (1|M0)  r13.2<1>:d      0x3f:w
[TRACE]  // h_4.s32(3) = 15
[TRACE]  (W)    mov     (1|M0)  r13.3<1>:d      0xf:w
[TRACE]  // h_4.s32(4) = 63
[TRACE]  (W)    mov     (1|M0)  r13.4<1>:d      0x3f:w
[TRACE]  // h_4.s32(5) = 0
[TRACE]  (W)    mov     (1|M0)  r13.5<1>:d      0x0:w
[TRACE]  // h_4.s32(6) = 0
[TRACE]  (W)    mov     (1|M0)  r13.6<1>:d      0x0:w
[TRACE]  // h_4.s32(7) = 3855
[TRACE]  (W)    mov     (1|M0)  r13.7<1>:d      0xf0f:w
[TRACE]  // h_4.u64(0) = u64(wei)
[TRACE]  (W)    mov     (1|M0)  r13.0<1>:uq     r4.5<0;1,0>:uq
[TRACE]  // load_2d.f32.1x16x16(wei, h_4, b[0], (nil), (nil), (nil), (nil))
[TRACE]  (W)    send.ugm        (1|M0)  r14     r13     null:0  0x0     0x3080403               {$3,I@1}
[TRACE]  // madx16(c[0], c[0], a[0], b[0])
[TRACE]  (W)    sync.nop                null:ud                                 {$1.dst}
[TRACE]  (W)    mad     (16|M0) r30.0<1>:f      r30.0<8;1>:f    r5.0<0;0>:f     r14.0<1>:f                      {$3.dst,@7}
[TRACE]  // madx16(c[64], c[64], a[64], b[0])
[TRACE]  (W)    mad     (16|M0) r31.0<1>:f      r31.0<8;1>:f    r6.0<0;0>:f     r14.0<1>:f                      {@5}
...
[TRACE]  // madx16(c[448], c[448], a[508], b[960])
[TRACE]  (W)    mad     (16|M0) r37.0<1>:f      r37.0<8;1>:f    r12.15<0;0>:f   r29.0<1>:f                      {@7}
[TRACE]  // alloc tmp_bia_0[64] -> r5
[TRACE]  // alloc h_5[64] -> r6
[TRACE]  // h_5.u64(0) = u64(bia)
[TRACE]  (W)    mov     (1|M0)  r6.0<1>:uq      r4.6<0;1,0>:uq
[TRACE]  // load.owordx4(bia, h_5, tmp_bia_0[0], (nil), (nil), (nil), (nil))
[TRACE]  (W)    send.ugm        (1|M0)  r5      r6      null:0  0x0     0x218c780               {$1,A@1}
[TRACE]  // c.f32x16(0) = (c.f32x16(0) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r30.0<1>:f      r30.0<8;8,1>:f  r5.0<8;8,1>:f                           {$1.dst}
[TRACE]  // c.f32x16(1) = (c.f32x16(1) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r31.0<1>:f      r31.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(2) = (c.f32x16(2) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r32.0<1>:f      r32.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(3) = (c.f32x16(3) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r33.0<1>:f      r33.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(4) = (c.f32x16(4) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r34.0<1>:f      r34.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(5) = (c.f32x16(5) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r35.0<1>:f      r35.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(6) = (c.f32x16(6) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r36.0<1>:f      r36.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // c.f32x16(7) = (c.f32x16(7) + tmp_bia_0.f32x16(0))
[TRACE]  (W)    add     (16|M0) r37.0<1>:f      r37.0<8;8,1>:f  r5.0<8;8,1>:f
[TRACE]  // alloc h_6[64] -> r5
[TRACE]  // h_6.u64(0) = u64(dst)
[TRACE]  (W)    mov     (1|M0)  r5.0<1>:uq      r4.7<0;1,0>:uq                                  {F@1}
[TRACE]  // store.owordx4(dst, h_6, c[0], true, (nil), (nil), (nil))
[TRACE]  (W)    send.ugm        (1|M0)  null    r5      r30:1   0x0     0x204c784               {$7,I@1}
[TRACE]  // alloc h_7[64] -> r5
[TRACE]  // h_7.u64(0) = (u64(dst) + 64)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.7<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r5.0<1>:uq      r4.0<0;1,0>:uq  0x40:w                          {$7.dst,@1}
[TRACE]  // store.owordx16(dst, h_7, c[64], false, (nil), (nil), (nil))
[TRACE]  // alloc h_8[64] -> r5
[TRACE]  // h_8.u64(0) = (u64(dst) + 320)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.7<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r5.0<1>:uq      r4.0<0;1,0>:uq  0x140:w                         {@1}
[TRACE]  // store.owordx8(dst, h_8, c[320], false, (nil), (nil), (nil))
[TRACE]  // alloc h_9[64] -> r5
[TRACE]  // h_9.u64(0) = (u64(dst) + 448)
[TRACE]  (W)    mov     (1|M0)  r4.0<1>:uq      r4.7<0;1,0>:uq
[TRACE]  (W)    add     (1|M0)  r5.0<1>:uq      r4.0<0;1,0>:uq  0x1c0:w                         {@1}
[TRACE]  // store.owordx4(dst, h_9, c[448], false, (nil), (nil), (nil))
[TRACE]  // Epilogue
[TRACE]  (W)    mov     (16|M0) r124.0<1>:ud    r0.0<8;8,1>:ud
[TRACE]  (W)    send.gtwy       (8|M0)  null    r124    null:0  0x0     0x2000010               {$31,I@1,EOT}
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop
[TRACE]  (W)    nop

@rjoursler rjoursler requested a review from a team as a code owner March 7, 2025 20:33
@github-actions github-actions bot added the platform:gpu-intel Codeowner: @oneapi-src/onednn-gpu-intel label Mar 7, 2025
@rjoursler rjoursler force-pushed the rjoursle/ir_asm_gen branch 3 times, most recently from d6be064 to 3fdbcba Compare March 11, 2025 13:58
@rjoursler rjoursler force-pushed the rjoursle/ir_asm_gen branch from 92512ab to 9ea9f96 Compare March 11, 2025 18:28
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
Labels
platform:gpu-intel Codeowner: @oneapi-src/onednn-gpu-intel
Projects
None yet
Development

Successfully merging this pull request may close these issues.

2 participants