@@ -557,11 +557,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
557
557
; GFX908-NEXT: s_mul_hi_u32 s9, s0, s5
558
558
; GFX908-NEXT: s_mul_i32 s0, s0, s5
559
559
; GFX908-NEXT: s_add_i32 s1, s9, s1
560
- ; GFX908-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
560
+ ; GFX908-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
561
561
; GFX908-NEXT: s_branch .LBB3_2
562
562
; GFX908-NEXT: .LBB3_1: ; %Flow20
563
563
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
564
- ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
564
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
565
565
; GFX908-NEXT: s_cbranch_vccz .LBB3_12
566
566
; GFX908-NEXT: .LBB3_2: ; %bb9
567
567
; GFX908-NEXT: ; =>This Loop Header: Depth=1
@@ -571,17 +571,15 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
571
571
; GFX908-NEXT: ; %bb.3: ; %bb14
572
572
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
573
573
; GFX908-NEXT: global_load_dwordx2 v[2:3], v[0:1], off
574
- ; GFX908-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
575
574
; GFX908-NEXT: s_mov_b32 s9, s8
576
- ; GFX908-NEXT: v_cndmask_b32_e64 v6, 0, 1, s[0:1]
577
575
; GFX908-NEXT: v_mov_b32_e32 v4, s8
578
- ; GFX908-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v6
579
576
; GFX908-NEXT: v_mov_b32_e32 v8, s8
580
577
; GFX908-NEXT: v_mov_b32_e32 v6, s8
581
578
; GFX908-NEXT: v_mov_b32_e32 v5, s9
582
579
; GFX908-NEXT: v_mov_b32_e32 v9, s9
583
580
; GFX908-NEXT: v_mov_b32_e32 v7, s9
584
- ; GFX908-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
581
+ ; GFX908-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
582
+ ; GFX908-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
585
583
; GFX908-NEXT: v_mov_b32_e32 v11, v5
586
584
; GFX908-NEXT: s_mov_b64 s[20:21], s[10:11]
587
585
; GFX908-NEXT: v_mov_b32_e32 v10, v4
@@ -601,9 +599,9 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
601
599
; GFX908-NEXT: ; in Loop: Header=BB3_5 Depth=2
602
600
; GFX908-NEXT: v_add_co_u32_sdwa v2, vcc, v2, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
603
601
; GFX908-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc
604
- ; GFX908-NEXT: s_add_u32 s20, s20, s14
602
+ ; GFX908-NEXT: s_add_u32 s20, s20, s0
605
603
; GFX908-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[2:3]
606
- ; GFX908-NEXT: s_addc_u32 s21, s21, s15
604
+ ; GFX908-NEXT: s_addc_u32 s21, s21, s1
607
605
; GFX908-NEXT: s_mov_b64 s[22:23], 0
608
606
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[24:25]
609
607
; GFX908-NEXT: s_cbranch_vccz .LBB3_9
@@ -622,7 +620,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
622
620
; GFX908-NEXT: s_waitcnt vmcnt(0)
623
621
; GFX908-NEXT: ds_read_b64 v[12:13], v19
624
622
; GFX908-NEXT: ds_read_b64 v[14:15], v0
625
- ; GFX908-NEXT: s_and_b64 vcc, exec, s[0:1 ]
623
+ ; GFX908-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
626
624
; GFX908-NEXT: s_waitcnt lgkmcnt(0)
627
625
; GFX908-NEXT: s_cbranch_vccnz .LBB3_7
628
626
; GFX908-NEXT: ; %bb.6: ; %bb51
@@ -650,7 +648,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
650
648
; GFX908-NEXT: s_mov_b64 s[22:23], -1
651
649
; GFX908-NEXT: s_branch .LBB3_4
652
650
; GFX908-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
653
- ; GFX908-NEXT: s_mov_b64 s[22:23], s[16:17 ]
651
+ ; GFX908-NEXT: s_mov_b64 s[22:23], s[14:15 ]
654
652
; GFX908-NEXT: s_andn2_b64 vcc, exec, s[22:23]
655
653
; GFX908-NEXT: s_cbranch_vccz .LBB3_4
656
654
; GFX908-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -665,7 +663,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
665
663
; GFX908-NEXT: s_xor_b64 s[16:17], s[22:23], -1
666
664
; GFX908-NEXT: .LBB3_10: ; %Flow19
667
665
; GFX908-NEXT: ; in Loop: Header=BB3_2 Depth=1
668
- ; GFX908-NEXT: s_mov_b64 s[0:1 ], -1
666
+ ; GFX908-NEXT: s_mov_b64 s[14:15 ], -1
669
667
; GFX908-NEXT: s_and_b64 vcc, exec, s[16:17]
670
668
; GFX908-NEXT: s_cbranch_vccz .LBB3_1
671
669
; GFX908-NEXT: ; %bb.11: ; %bb12
@@ -674,7 +672,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
674
672
; GFX908-NEXT: s_addc_u32 s7, s7, 0
675
673
; GFX908-NEXT: s_add_u32 s10, s10, s12
676
674
; GFX908-NEXT: s_addc_u32 s11, s11, s13
677
- ; GFX908-NEXT: s_mov_b64 s[0:1 ], 0
675
+ ; GFX908-NEXT: s_mov_b64 s[14:15 ], 0
678
676
; GFX908-NEXT: s_branch .LBB3_1
679
677
; GFX908-NEXT: .LBB3_12: ; %DummyReturnBlock
680
678
; GFX908-NEXT: s_endpgm
@@ -724,11 +722,11 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
724
722
; GFX90A-NEXT: s_mul_hi_u32 s9, s0, s5
725
723
; GFX90A-NEXT: s_mul_i32 s0, s0, s5
726
724
; GFX90A-NEXT: s_add_i32 s1, s9, s1
727
- ; GFX90A-NEXT: s_lshl_b64 s[14:15 ], s[0:1], 5
725
+ ; GFX90A-NEXT: s_lshl_b64 s[0:1 ], s[0:1], 5
728
726
; GFX90A-NEXT: s_branch .LBB3_2
729
727
; GFX90A-NEXT: .LBB3_1: ; %Flow20
730
728
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
731
- ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[0:1 ]
729
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[14:15 ]
732
730
; GFX90A-NEXT: s_cbranch_vccz .LBB3_12
733
731
; GFX90A-NEXT: .LBB3_2: ; %bb9
734
732
; GFX90A-NEXT: ; =>This Loop Header: Depth=1
@@ -738,14 +736,12 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
738
736
; GFX90A-NEXT: ; %bb.3: ; %bb14
739
737
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
740
738
; GFX90A-NEXT: global_load_dwordx2 v[4:5], v[2:3], off
741
- ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[0:1], s[6:7], -1
742
739
; GFX90A-NEXT: s_mov_b32 s9, s8
743
- ; GFX90A-NEXT: v_cndmask_b32_e64 v8, 0, 1, s[0:1]
744
740
; GFX90A-NEXT: v_pk_mov_b32 v[6:7], s[8:9], s[8:9] op_sel:[0,1]
745
- ; GFX90A-NEXT: v_cmp_ne_u32_e64 s[0:1], 1, v8
746
741
; GFX90A-NEXT: v_pk_mov_b32 v[10:11], s[8:9], s[8:9] op_sel:[0,1]
747
742
; GFX90A-NEXT: v_pk_mov_b32 v[8:9], s[8:9], s[8:9] op_sel:[0,1]
748
- ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[16:17], s[6:7], 0
743
+ ; GFX90A-NEXT: v_cmp_lt_i64_e64 s[14:15], s[6:7], 0
744
+ ; GFX90A-NEXT: v_cmp_gt_i64_e64 s[16:17], s[6:7], -1
749
745
; GFX90A-NEXT: s_mov_b64 s[20:21], s[10:11]
750
746
; GFX90A-NEXT: v_pk_mov_b32 v[12:13], v[6:7], v[6:7] op_sel:[0,1]
751
747
; GFX90A-NEXT: s_waitcnt vmcnt(0)
@@ -764,8 +760,8 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
764
760
; GFX90A-NEXT: ; in Loop: Header=BB3_5 Depth=2
765
761
; GFX90A-NEXT: v_add_co_u32_sdwa v4, vcc, v4, v18 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0
766
762
; GFX90A-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v5, vcc
767
- ; GFX90A-NEXT: s_add_u32 s20, s20, s14
768
- ; GFX90A-NEXT: s_addc_u32 s21, s21, s15
763
+ ; GFX90A-NEXT: s_add_u32 s20, s20, s0
764
+ ; GFX90A-NEXT: s_addc_u32 s21, s21, s1
769
765
; GFX90A-NEXT: v_cmp_lt_i64_e64 s[24:25], -1, v[4:5]
770
766
; GFX90A-NEXT: s_mov_b64 s[22:23], 0
771
767
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[24:25]
@@ -785,7 +781,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
785
781
; GFX90A-NEXT: s_waitcnt vmcnt(0)
786
782
; GFX90A-NEXT: ds_read_b64 v[14:15], v19
787
783
; GFX90A-NEXT: ds_read_b64 v[16:17], v0
788
- ; GFX90A-NEXT: s_and_b64 vcc, exec, s[0:1 ]
784
+ ; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[16:17 ]
789
785
; GFX90A-NEXT: ; kill: killed $sgpr22 killed $sgpr23
790
786
; GFX90A-NEXT: s_waitcnt lgkmcnt(0)
791
787
; GFX90A-NEXT: s_cbranch_vccnz .LBB3_7
@@ -806,7 +802,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
806
802
; GFX90A-NEXT: s_mov_b64 s[22:23], -1
807
803
; GFX90A-NEXT: s_branch .LBB3_4
808
804
; GFX90A-NEXT: .LBB3_7: ; in Loop: Header=BB3_5 Depth=2
809
- ; GFX90A-NEXT: s_mov_b64 s[22:23], s[16:17 ]
805
+ ; GFX90A-NEXT: s_mov_b64 s[22:23], s[14:15 ]
810
806
; GFX90A-NEXT: s_andn2_b64 vcc, exec, s[22:23]
811
807
; GFX90A-NEXT: s_cbranch_vccz .LBB3_4
812
808
; GFX90A-NEXT: ; %bb.8: ; in Loop: Header=BB3_2 Depth=1
@@ -821,7 +817,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
821
817
; GFX90A-NEXT: s_xor_b64 s[16:17], s[22:23], -1
822
818
; GFX90A-NEXT: .LBB3_10: ; %Flow19
823
819
; GFX90A-NEXT: ; in Loop: Header=BB3_2 Depth=1
824
- ; GFX90A-NEXT: s_mov_b64 s[0:1 ], -1
820
+ ; GFX90A-NEXT: s_mov_b64 s[14:15 ], -1
825
821
; GFX90A-NEXT: s_and_b64 vcc, exec, s[16:17]
826
822
; GFX90A-NEXT: s_cbranch_vccz .LBB3_1
827
823
; GFX90A-NEXT: ; %bb.11: ; %bb12
@@ -830,7 +826,7 @@ define amdgpu_kernel void @introduced_copy_to_sgpr(i64 %arg, i32 %arg1, i32 %arg
830
826
; GFX90A-NEXT: s_addc_u32 s7, s7, 0
831
827
; GFX90A-NEXT: s_add_u32 s10, s10, s12
832
828
; GFX90A-NEXT: s_addc_u32 s11, s11, s13
833
- ; GFX90A-NEXT: s_mov_b64 s[0:1 ], 0
829
+ ; GFX90A-NEXT: s_mov_b64 s[14:15 ], 0
834
830
; GFX90A-NEXT: s_branch .LBB3_1
835
831
; GFX90A-NEXT: .LBB3_12: ; %DummyReturnBlock
836
832
; GFX90A-NEXT: s_endpgm
0 commit comments