Skip to content

Commit 60d9010

Browse files
committed
AMDGPU: Fix issue in shl(or) combine
The code performs the optimization `((a | c1) << c2)` ==> `(a << c2) + (c1 << c2)`, but this is only valid if `a` and `c1` have no set bits in common, because only then is `a | c1` equal to `a + c1`. Differential Revision: https://reviews.llvm.org/D150246
1 parent 5130e04 commit 60d9010

File tree

2 files changed

+22
-19
lines changed

2 files changed

+22
-19
lines changed

llvm/lib/Target/AMDGPU/SIISelLowering.cpp

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -9550,6 +9550,8 @@ SDValue SITargetLowering::performFCopySignCombine(SDNode *N,
95509550
}
95519551

95529552
// (shl (add x, c1), c2) -> add (shl x, c2), (shl c1, c2)
9553+
// (shl (or x, c1), c2) -> add (shl x, c2), (shl c1, c2) iff x and c1 share no
9554+
// bits
95539555

95549556
// This is a variant of
95559557
// (mul (add x, c1), c2) -> add (mul x, c2), (mul c1, c2),
@@ -9584,8 +9586,14 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
95849586
if (!CAdd)
95859587
return SDValue();
95869588

9587-
// If the resulting offset is too large, we can't fold it into the addressing
9588-
// mode offset.
9589+
SelectionDAG &DAG = DCI.DAG;
9590+
9591+
if (N0->getOpcode() == ISD::OR &&
9592+
!DAG.haveNoCommonBitsSet(N0.getOperand(0), N0.getOperand(1)))
9593+
return SDValue();
9594+
9595+
// If the resulting offset is too large, we can't fold it into the
9596+
// addressing mode offset.
95899597
APInt Offset = CAdd->getAPIntValue() << CN1->getAPIntValue();
95909598
Type *Ty = MemVT.getTypeForEVT(*DCI.DAG.getContext());
95919599

@@ -9595,7 +9603,6 @@ SDValue SITargetLowering::performSHLPtrCombine(SDNode *N,
95959603
if (!isLegalAddressingMode(DCI.DAG.getDataLayout(), AM, Ty, AddrSpace))
95969604
return SDValue();
95979605

9598-
SelectionDAG &DAG = DCI.DAG;
95999606
SDLoc SL(N);
96009607
EVT VT = N->getValueType(0);
96019608

llvm/test/CodeGen/AMDGPU/shl_add_ptr.ll

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -410,15 +410,12 @@ define void @shl_add_ptr_combine_2use_both_max_private_offset(i16 zeroext %idx.a
410410
ret void
411411
}
412412

413-
; FIXME: This or should fold into an offset on the write
414413
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_lds:
415-
; GCN: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
416-
; GCN: v_or_b32_e32 [[SCALE1:v[0-9]+]], 32, [[SCALE0]]
417-
; GCN: v_lshlrev_b32_e32 [[SCALE2:v[0-9]+]], 4, v0
418-
; GCN: ds_write_b32 [[SCALE1]], v{{[0-9]+}}
419-
; GCN: ds_write_b32 [[SCALE2]], v{{[0-9]+}} offset:64
414+
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:8
415+
; GCN-DAG: ds_write_b32 v{{[0-9]+}}, v{{[0-9]+}} offset:16
420416
define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
421-
%idx.add = or i32 %idx, 4
417+
%idx.shl = shl i32 %idx, 1
418+
%idx.add = or i32 %idx.shl, 1
422419
%shl0 = shl i32 %idx.add, 3
423420
%shl1 = shl i32 %idx.add, 4
424421
%ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)
@@ -427,15 +424,14 @@ define void @shl_or_ptr_combine_2use_lds(i32 %idx) #0 {
427424
store volatile i32 10, ptr addrspace(3) %ptr1
428425
ret void
429426
}
430-
431-
; GCN-LABEL: {{^}}shl_or_ptr_combine_2use_max_lds_offset:
432-
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, v0
433-
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, v0
434-
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}} offset:65528
435-
; GCN-DAG: v_or_b32_e32 [[ADD1:v[0-9]+]], 0x1fff0, [[SCALE1]]
436-
; GCN: ds_write_b32 [[ADD1]], v{{[0-9]+$}}
437-
define void @shl_or_ptr_combine_2use_max_lds_offset(i32 %idx) #0 {
438-
%idx.add = or i32 %idx, 8191
427+
; GCN-LABEL: {{^}}shl_or_ptr_not_combine_2use_lds:
428+
; GCN: v_or_b32_e32 [[OR:v[0-9]+]], 1, v0
429+
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE0:v[0-9]+]], 3, [[OR]]
430+
; GCN-DAG: v_lshlrev_b32_e32 [[SCALE1:v[0-9]+]], 4, [[OR]]
431+
; GCN-DAG: ds_write_b32 [[SCALE0]], v{{[0-9]+}}{{$}}
432+
; GCN-DAG: ds_write_b32 [[SCALE1]], v{{[0-9]+}}{{$}}
433+
define void @shl_or_ptr_not_combine_2use_lds(i32 %idx) #0 {
434+
%idx.add = or i32 %idx, 1
439435
%shl0 = shl i32 %idx.add, 3
440436
%shl1 = shl i32 %idx.add, 4
441437
%ptr0 = inttoptr i32 %shl0 to ptr addrspace(3)

0 commit comments

Comments
 (0)