AMD IL Empty Instruction Counts

Just a note on the content of this article:
I am still learning about AMD IL, so this could be caused by my own mistakes when getting the IL code!

So, recently I have been looking at the AMD ISA and AMD IL code generated from HLSL shaders. During my last poke around in the generated AMD IL code, I noticed that the code is generated in 64-instruction chunks. If your shader converts to 63 instructions, the AMD IL code will be 64 instructions long, with the last instruction being " v_cndmask_b32  v0, s0, v0, vcc ". But if you go over this and have a 65-instruction shader, the generated code will be 128 instructions long: effectively double the number of instructions to gain one more.

Here is an example:

HLSL
// Pixel shader entry point (first variant): starts from 1.0, adds six
// components of the input, and returns the total in all four components of
// the SV_TARGET output.
float4 psMain(PS_INPUT input) : SV_TARGET
{
          // Running total, seeded with 1.0.
          float IC = 1.0f;
          // Add the x, y and z components of input.pos.
          IC += input.pos.x;
          IC += input.pos.y;
          IC += input.pos.z;
          // Add the x, y and z components of input.tex.
          IC += input.tex.x;
          IC += input.tex.y;
          IC += input.tex.z;

// Code that will tip us over the 64 instruction block (disabled in this variant).
//IC += input.tex.w;

	// Replicate the total into all four components of the result.
	return float4(IC,IC ,IC ,IC );
}
AMD IL
shader psMain

  v_cndmask_b32  v0, s9, v0, vcc                            // 00000000: 00000009
  v_cndmask_b32  v0, s0, v129, vcc                          // 00000004: 00010200
  v_cndmask_b32  v0, v93, v128, vcc                         // 00000008: 0001015D
  v_cndmask_b32  v64, exec_lo, v0, vcc                      // 0000000C: 0080007E
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000010: 00610000
  v_cndmask_b32  v0, s21, v0, vcc                           // 00000014: 00000015
  v_cndmask_b32  v35, exec_lo, v0, vcc                      // 00000018: 0046007E
  v_cndmask_b32  v48, s1, v128, vcc                         // 0000001C: 00610001
  v_cndmask_b32  v0, s21, v0, vcc                           // 00000020: 00000015
  v_cndmask_b32  v3, s125, v0, vcc                          // 00000024: 0006007D
  v_cndmask_b32  v17, s0, v0, vcc                           // 00000028: 00220000
  v_cndmask_b32  v0, s3, v0, vcc                            // 0000002C: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000030: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000034: 00000001
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000038: 00610000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000003C: 00000000
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000040: 00610000
  v_cndmask_b32  v0, v17, v8, vcc                           // 00000044: 00001111
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000048: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 0000004C: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000050: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000054: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000058: 00000000
  v_cndmask_b32  v48, s0, v128, vcc                         // 0000005C: 00610000
  v_cndmask_b32  v0, s34, v17, vcc                          // 00000060: 00002222
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000064: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000068: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 0000006C: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000070: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000074: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 00000078: 00610001
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000007C: 00000000
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000080: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000084: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000088: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 0000008C: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000090: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 00000094: 00610001
  v_cndmask_b32  v0, v17, v8, vcc                           // 00000098: 00001111
  v_cndmask_b32  v0, s3, v0, vcc                            // 0000009C: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000A0: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 000000A4: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000A8: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000000AC: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 000000B0: 00610001
  v_cndmask_b32  v0, s34, v17, vcc                          // 000000B4: 00002222
  v_cndmask_b32  v0, ttmp9, v0, vcc                         // 000000B8: 00000079
  v_cndmask_b32  v16, s0, v0, vcc                           // 000000BC: 00200000
  v_mac_f32     v192, s0, v0                                // 000000C0: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000C4: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000C8: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000CC: 3F800000
  v_cndmask_b32  v0, s3, v0, vcc                            // 000000D0: 00000003
  v_cndmask_b32  v2, s0, v8, vcc                            // 000000D4: 00041000
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000D8: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000000DC: 00000000
  v_cndmask_b32  v16, s0, v0, vcc                           // 000000E0: 00200000
  v_cndmask_b32  v0, s71, v0, vcc                           // 000000E4: 00000047
  v_cndmask_b32  v49, s0, v0, vcc                           // 000000E8: 00620000
  v_cndmask_b32  v0, s85, v0, vcc                           // 000000EC: 00000055
  v_cndmask_b32  v34, s0, v8, vcc                           // 000000F0: 00441000
  v_cndmask_b32  v0, s16, v25, vcc                          // 000000F4: 00003210
  v_cndmask_b32  v0, ttmp3, v0, vcc                         // 000000F8: 00000073
  v_cndmask_b32  v0, s40, v0, vcc                           // 000000FC: 00000028
end

Above we can see the generated code, which fits exactly within the 64-instruction limit. If we then uncomment the last addition, this happens:

HLSL
// Pixel shader entry point (second variant): starts from 1.0, adds seven
// components of the input, and returns the total in all four components of
// the SV_TARGET output.
float4 psMain(PS_INPUT input) : SV_TARGET
{
          // Running total, seeded with 1.0.
          float IC = 1.0f;
          // Add the x, y and z components of input.pos.
          IC += input.pos.x;
          IC += input.pos.y;
          IC += input.pos.z;
          // Add the x, y and z components of input.tex.
          IC += input.tex.x;
          IC += input.tex.y;
          IC += input.tex.z;

// Code that will tip us over the 64 instruction block (enabled in this variant).
IC += input.tex.w;

	// Replicate the total into all four components of the result.
	return float4(IC,IC ,IC ,IC );
}
AMD IL
shader psMain

  v_cndmask_b32  v0, s9, v0, vcc                            // 00000000: 00000009
  v_cndmask_b32  v0, s0, v129, vcc                          // 00000004: 00010200
  v_cndmask_b32  v0, v93, v128, vcc                         // 00000008: 0001015D
  v_cndmask_b32  v64, exec_lo, v0, vcc                      // 0000000C: 0080007E
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000010: 00610000
  v_cndmask_b32  v0, s21, v0, vcc                           // 00000014: 00000015
  v_cndmask_b32  v35, exec_lo, v0, vcc                      // 00000018: 0046007E
  v_cndmask_b32  v16, s1, v128, vcc                         // 0000001C: 00210001
  v_cndmask_b32  v3, s125, v0, vcc                          // 00000020: 0006007D
  v_cndmask_b32  v17, s0, v0, vcc                           // 00000024: 00220000
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000028: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 0000002C: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000030: 00000001
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000034: 00610000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000038: 00000000
  v_cndmask_b32  v48, s0, v128, vcc                         // 0000003C: 00610000
  v_cndmask_b32  v0, v17, v8, vcc                           // 00000040: 00001111
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000044: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000048: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 0000004C: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000050: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000054: 00000000
  v_cndmask_b32  v48, s0, v128, vcc                         // 00000058: 00610000
  v_cndmask_b32  v0, s34, v17, vcc                          // 0000005C: 00002222
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000060: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000064: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000068: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 0000006C: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000070: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 00000074: 00610001
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000078: 00000000
  v_cndmask_b32  v0, s3, v0, vcc                            // 0000007C: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000080: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 00000084: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 00000088: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000008C: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 00000090: 00610001
  v_cndmask_b32  v0, v17, v8, vcc                           // 00000094: 00001111
  v_cndmask_b32  v0, s3, v0, vcc                            // 00000098: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 0000009C: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 000000A0: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000A4: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000000A8: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 000000AC: 00610001
  v_cndmask_b32  v0, s34, v17, vcc                          // 000000B0: 00002222
  v_cndmask_b32  v0, s3, v0, vcc                            // 000000B4: 00000003
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000B8: 00440000
  v_cndmask_b32  v0, s1, v0, vcc                            // 000000BC: 00000001
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000C0: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000000C4: 00000000
  v_cndmask_b32  v48, s1, v128, vcc                         // 000000C8: 00610001
  v_cndmask_b32  v0, v51, v25, vcc                          // 000000CC: 00003333
  v_cndmask_b32  v0, ttmp9, v0, vcc                         // 000000D0: 00000079
  v_cndmask_b32  v16, s0, v0, vcc                           // 000000D4: 00200000
  v_mac_f32     v192, s0, v0                                // 000000D8: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000DC: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000E0: 3F800000
  v_mac_f32     v192, s0, v0                                // 000000E4: 3F800000
  v_cndmask_b32  v0, s3, v0, vcc                            // 000000E8: 00000003
  v_cndmask_b32  v2, s0, v8, vcc                            // 000000EC: 00041000
  v_cndmask_b32  v34, s0, v0, vcc                           // 000000F0: 00440000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000000F4: 00000000
  v_cndmask_b32  v16, s0, v0, vcc                           // 000000F8: 00200000
  v_cndmask_b32  v0, s71, v0, vcc                           // 000000FC: 00000047
  v_cndmask_b32  v49, s0, v0, vcc                           // 00000100: 00620000
  v_cndmask_b32  v0, s85, v0, vcc                           // 00000104: 00000055
  v_cndmask_b32  v34, s0, v8, vcc                           // 00000108: 00441000
  v_cndmask_b32  v0, s16, v25, vcc                          // 0000010C: 00003210
  v_cndmask_b32  v0, ttmp3, v0, vcc                         // 00000110: 00000073
  v_cndmask_b32  v0, s40, v0, vcc                           // 00000114: 00000028
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000118: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000011C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000120: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000124: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000128: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000012C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000130: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000134: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000138: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000013C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000140: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000144: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000148: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000014C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000150: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000154: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000158: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000015C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000160: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000164: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000168: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000016C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000170: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000174: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000178: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000017C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000180: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000184: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000188: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000018C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000190: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000194: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 00000198: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 0000019C: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001A0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001A4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001A8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001AC: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001B0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001B4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001B8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001BC: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001C0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001C4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001C8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001CC: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001D0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001D4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001D8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001DC: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001E0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001E4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001E8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001EC: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001F0: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001F4: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001F8: 00000000
  v_cndmask_b32  v0, s0, v0, vcc                            // 000001FC: 00000000
end

I am not sure how this is handled on the device: whether it actually runs the empty instructions, or whether that is just what is loaded into memory and execution drops out early. Either way, it does look a little strange!

I am going to continue my research into how a shader actually runs on the device and will update this if I find out more!