A note on the content of this article:
I am still learning about AMD IL, so what I describe here could be caused by my own mistakes when extracting the IL code!
So, recently I have been looking at the AMD ISA and AMD IL code generated from HLSL shaders. During my last poke around in the generated AMD IL code, I noticed that the code is generated in 64-instruction chunks. If your shader converts to 63 instructions, the AMD IL code will be 64 instructions long, with the last instruction being "v_cndmask_b32 v0, s0, v0, vcc". But if you go over this and have a 65-instruction shader, the generated code will be 128 instructions long, effectively doubling the number of instructions to gain one more.
Here is an example:
// Pixel shader entry point used for the instruction-count experiment.
// Starts from 1.0, adds the x/y/z components of input.pos and input.tex,
// and returns the resulting scalar replicated into all four output channels.
float4 psMain(PS_INPUT input) : SV_TARGET
{
    float IC = 1.0f;
    IC += input.pos.x;
    IC += input.pos.y;
    IC += input.pos.z;
    IC += input.tex.x;
    IC += input.tex.y;
    IC += input.tex.z;

    //Code that will tip us over the 64 instruction block.
    //IC += input.tex.w;

    return float4(IC, IC, IC, IC);
}
shader psMain v_cndmask_b32 v0, s9, v0, vcc // 00000000: 00000009 v_cndmask_b32 v0, s0, v129, vcc // 00000004: 00010200 v_cndmask_b32 v0, v93, v128, vcc // 00000008: 0001015D v_cndmask_b32 v64, exec_lo, v0, vcc // 0000000C: 0080007E v_cndmask_b32 v48, s0, v128, vcc // 00000010: 00610000 v_cndmask_b32 v0, s21, v0, vcc // 00000014: 00000015 v_cndmask_b32 v35, exec_lo, v0, vcc // 00000018: 0046007E v_cndmask_b32 v48, s1, v128, vcc // 0000001C: 00610001 v_cndmask_b32 v0, s21, v0, vcc // 00000020: 00000015 v_cndmask_b32 v3, s125, v0, vcc // 00000024: 0006007D v_cndmask_b32 v17, s0, v0, vcc // 00000028: 00220000 v_cndmask_b32 v0, s3, v0, vcc // 0000002C: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000030: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000034: 00000001 v_cndmask_b32 v48, s0, v128, vcc // 00000038: 00610000 v_cndmask_b32 v0, s0, v0, vcc // 0000003C: 00000000 v_cndmask_b32 v48, s0, v128, vcc // 00000040: 00610000 v_cndmask_b32 v0, v17, v8, vcc // 00000044: 00001111 v_cndmask_b32 v0, s3, v0, vcc // 00000048: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 0000004C: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000050: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 00000054: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 00000058: 00000000 v_cndmask_b32 v48, s0, v128, vcc // 0000005C: 00610000 v_cndmask_b32 v0, s34, v17, vcc // 00000060: 00002222 v_cndmask_b32 v0, s3, v0, vcc // 00000064: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000068: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 0000006C: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 00000070: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 00000074: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 00000078: 00610001 v_cndmask_b32 v0, s0, v0, vcc // 0000007C: 00000000 v_cndmask_b32 v0, s3, v0, vcc // 00000080: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000084: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000088: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 0000008C: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 00000090: 00000000 
v_cndmask_b32 v48, s1, v128, vcc // 00000094: 00610001 v_cndmask_b32 v0, v17, v8, vcc // 00000098: 00001111 v_cndmask_b32 v0, s3, v0, vcc // 0000009C: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 000000A0: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 000000A4: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 000000A8: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 000000AC: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 000000B0: 00610001 v_cndmask_b32 v0, s34, v17, vcc // 000000B4: 00002222 v_cndmask_b32 v0, ttmp9, v0, vcc // 000000B8: 00000079 v_cndmask_b32 v16, s0, v0, vcc // 000000BC: 00200000 v_mac_f32 v192, s0, v0 // 000000C0: 3F800000 v_mac_f32 v192, s0, v0 // 000000C4: 3F800000 v_mac_f32 v192, s0, v0 // 000000C8: 3F800000 v_mac_f32 v192, s0, v0 // 000000CC: 3F800000 v_cndmask_b32 v0, s3, v0, vcc // 000000D0: 00000003 v_cndmask_b32 v2, s0, v8, vcc // 000000D4: 00041000 v_cndmask_b32 v34, s0, v0, vcc // 000000D8: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 000000DC: 00000000 v_cndmask_b32 v16, s0, v0, vcc // 000000E0: 00200000 v_cndmask_b32 v0, s71, v0, vcc // 000000E4: 00000047 v_cndmask_b32 v49, s0, v0, vcc // 000000E8: 00620000 v_cndmask_b32 v0, s85, v0, vcc // 000000EC: 00000055 v_cndmask_b32 v34, s0, v8, vcc // 000000F0: 00441000 v_cndmask_b32 v0, s16, v25, vcc // 000000F4: 00003210 v_cndmask_b32 v0, ttmp3, v0, vcc // 000000F8: 00000073 v_cndmask_b32 v0, s40, v0, vcc // 000000FC: 00000028 end
Above we can see the generated code, which fits exactly within the 64-instruction limit. If we then uncomment the last addition, this happens:
// Pixel shader entry point used for the instruction-count experiment.
// Starts from 1.0, adds the x/y/z components of input.pos and the x/y/z/w
// components of input.tex, and returns the resulting scalar replicated into
// all four output channels.
float4 psMain(PS_INPUT input) : SV_TARGET
{
    float IC = 1.0f;
    IC += input.pos.x;
    IC += input.pos.y;
    IC += input.pos.z;
    IC += input.tex.x;
    IC += input.tex.y;
    IC += input.tex.z;

    //Code that will tip us over the 64 instruction block.
    IC += input.tex.w;

    return float4(IC, IC, IC, IC);
}
shader psMain v_cndmask_b32 v0, s9, v0, vcc // 00000000: 00000009 v_cndmask_b32 v0, s0, v129, vcc // 00000004: 00010200 v_cndmask_b32 v0, v93, v128, vcc // 00000008: 0001015D v_cndmask_b32 v64, exec_lo, v0, vcc // 0000000C: 0080007E v_cndmask_b32 v48, s0, v128, vcc // 00000010: 00610000 v_cndmask_b32 v0, s21, v0, vcc // 00000014: 00000015 v_cndmask_b32 v35, exec_lo, v0, vcc // 00000018: 0046007E v_cndmask_b32 v16, s1, v128, vcc // 0000001C: 00210001 v_cndmask_b32 v3, s125, v0, vcc // 00000020: 0006007D v_cndmask_b32 v17, s0, v0, vcc // 00000024: 00220000 v_cndmask_b32 v0, s3, v0, vcc // 00000028: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 0000002C: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000030: 00000001 v_cndmask_b32 v48, s0, v128, vcc // 00000034: 00610000 v_cndmask_b32 v0, s0, v0, vcc // 00000038: 00000000 v_cndmask_b32 v48, s0, v128, vcc // 0000003C: 00610000 v_cndmask_b32 v0, v17, v8, vcc // 00000040: 00001111 v_cndmask_b32 v0, s3, v0, vcc // 00000044: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000048: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 0000004C: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 00000050: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 00000054: 00000000 v_cndmask_b32 v48, s0, v128, vcc // 00000058: 00610000 v_cndmask_b32 v0, s34, v17, vcc // 0000005C: 00002222 v_cndmask_b32 v0, s3, v0, vcc // 00000060: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000064: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000068: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 0000006C: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 00000070: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 00000074: 00610001 v_cndmask_b32 v0, s0, v0, vcc // 00000078: 00000000 v_cndmask_b32 v0, s3, v0, vcc // 0000007C: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 00000080: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 00000084: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 00000088: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 0000008C: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 00000090: 00610001 
v_cndmask_b32 v0, v17, v8, vcc // 00000094: 00001111 v_cndmask_b32 v0, s3, v0, vcc // 00000098: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 0000009C: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 000000A0: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 000000A4: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 000000A8: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 000000AC: 00610001 v_cndmask_b32 v0, s34, v17, vcc // 000000B0: 00002222 v_cndmask_b32 v0, s3, v0, vcc // 000000B4: 00000003 v_cndmask_b32 v34, s0, v0, vcc // 000000B8: 00440000 v_cndmask_b32 v0, s1, v0, vcc // 000000BC: 00000001 v_cndmask_b32 v34, s0, v0, vcc // 000000C0: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 000000C4: 00000000 v_cndmask_b32 v48, s1, v128, vcc // 000000C8: 00610001 v_cndmask_b32 v0, v51, v25, vcc // 000000CC: 00003333 v_cndmask_b32 v0, ttmp9, v0, vcc // 000000D0: 00000079 v_cndmask_b32 v16, s0, v0, vcc // 000000D4: 00200000 v_mac_f32 v192, s0, v0 // 000000D8: 3F800000 v_mac_f32 v192, s0, v0 // 000000DC: 3F800000 v_mac_f32 v192, s0, v0 // 000000E0: 3F800000 v_mac_f32 v192, s0, v0 // 000000E4: 3F800000 v_cndmask_b32 v0, s3, v0, vcc // 000000E8: 00000003 v_cndmask_b32 v2, s0, v8, vcc // 000000EC: 00041000 v_cndmask_b32 v34, s0, v0, vcc // 000000F0: 00440000 v_cndmask_b32 v0, s0, v0, vcc // 000000F4: 00000000 v_cndmask_b32 v16, s0, v0, vcc // 000000F8: 00200000 v_cndmask_b32 v0, s71, v0, vcc // 000000FC: 00000047 v_cndmask_b32 v49, s0, v0, vcc // 00000100: 00620000 v_cndmask_b32 v0, s85, v0, vcc // 00000104: 00000055 v_cndmask_b32 v34, s0, v8, vcc // 00000108: 00441000 v_cndmask_b32 v0, s16, v25, vcc // 0000010C: 00003210 v_cndmask_b32 v0, ttmp3, v0, vcc // 00000110: 00000073 v_cndmask_b32 v0, s40, v0, vcc // 00000114: 00000028 v_cndmask_b32 v0, s0, v0, vcc // 00000118: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000011C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000120: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000124: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000128: 00000000 v_cndmask_b32 v0, 
s0, v0, vcc // 0000012C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000130: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000134: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000138: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000013C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000140: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000144: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000148: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000014C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000150: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000154: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000158: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000015C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000160: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000164: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000168: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000016C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000170: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000174: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000178: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000017C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000180: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000184: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000188: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000018C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000190: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000194: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 00000198: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 0000019C: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001A0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001A4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001A8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001AC: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001B0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001B4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001B8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001BC: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001C0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 
000001C4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001C8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001CC: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001D0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001D4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001D8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001DC: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001E0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001E4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001E8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001EC: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001F0: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001F4: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001F8: 00000000 v_cndmask_b32 v0, s0, v0, vcc // 000001FC: 00000000 end
I am not sure how this is handled on the device: whether it actually runs the empty instructions, or whether that is just what is loaded into memory and execution drops out early. Either way, it does look a little strange!
I am going to continue my research into how a shader actually runs on the device and will update this if I find out more!