Skip to content

Commit 5dfeb5c

Browse files
committed
[X86][ISelLowering] Lower minimum[num]/maximum[num] using bitwise ops
1 parent 24b87b8 commit 5dfeb5c

File tree

6 files changed

+2129
-3499
lines changed

6 files changed

+2129
-3499
lines changed

llvm/lib/Target/X86/X86ISelLowering.cpp

Lines changed: 81 additions & 40 deletions
Original file line numberDiff line numberDiff line change
@@ -29480,7 +29480,6 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2948029480
uint64_t SizeInBits = VT.getScalarSizeInBits();
2948129481
APInt PreferredZero = APInt::getZero(SizeInBits);
2948229482
APInt OppositeZero = PreferredZero;
29483-
EVT IVT = VT.changeTypeToInteger();
2948429483
X86ISD::NodeType MinMaxOp;
2948529484
if (IsMaxOp) {
2948629485
MinMaxOp = X86ISD::FMAX;
@@ -29492,8 +29491,8 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2949229491
EVT SetCCType =
2949329492
TLI.getSetCCResultType(DAG.getDataLayout(), *DAG.getContext(), VT);
2949429493

29495-
// The tables below show the expected result of Max in cases of NaN and
29496-
// signed zeros.
29494+
// The tables below show the expected result of Max in cases of NaN and signed
29495+
// zeros.
2949729496
//
2949829497
// Y Y
2949929498
// Num xNaN +0 -0
@@ -29503,12 +29502,9 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2950329502
// xNaN | X | X/Y | -0 | +0 | -0 |
2950429503
// --------------- ---------------
2950529504
//
29506-
// It is achieved by means of FMAX/FMIN with preliminary checks and operand
29507-
// reordering.
29508-
//
29509-
// We check if any of operands is NaN and return NaN. Then we check if any of
29510-
// operands is zero or negative zero (for fmaximum and fminimum respectively)
29511-
// to ensure the correct zero is returned.
29505+
// It is achieved by means of FMAX/FMIN with preliminary checks, operand
29506+
// reordering if one operand is a constant, and bitwise operations and selects
29507+
// to handle signed zero and NaN operands otherwise.
2951229508
auto MatchesZero = [](SDValue Op, APInt Zero) {
2951329509
Op = peekThroughBitcasts(Op);
2951429510
if (auto *CstOp = dyn_cast<ConstantFPSDNode>(Op))
@@ -29539,15 +29535,17 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2953929535
Op->getFlags().hasNoSignedZeros() ||
2954029536
DAG.isKnownNeverZeroFloat(X) ||
2954129537
DAG.isKnownNeverZeroFloat(Y);
29542-
SDValue NewX, NewY;
29538+
bool ShouldHandleZeros = true;
29539+
SDValue NewX = X;
29540+
SDValue NewY = Y;
2954329541
if (IgnoreSignedZero || MatchesZero(Y, PreferredZero) ||
2954429542
MatchesZero(X, OppositeZero)) {
2954529543
// Operands are already in right order or order does not matter.
29546-
NewX = X;
29547-
NewY = Y;
29544+
ShouldHandleZeros = false;
2954829545
} else if (MatchesZero(X, PreferredZero) || MatchesZero(Y, OppositeZero)) {
2954929546
NewX = Y;
2955029547
NewY = X;
29548+
ShouldHandleZeros = false;
2955129549
} else if (!VT.isVector() && (VT == MVT::f16 || Subtarget.hasDQI()) &&
2955229550
(Op->getFlags().hasNoNaNs() || IsXNeverNaN || IsYNeverNaN)) {
2955329551
if (IsXNeverNaN)
@@ -29569,33 +29567,6 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2956929567
NewX = DAG.getSelect(DL, VT, NeedSwap, Y, X);
2957029568
NewY = DAG.getSelect(DL, VT, NeedSwap, X, Y);
2957129569
return DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
29572-
} else {
29573-
SDValue IsXSigned;
29574-
if (Subtarget.is64Bit() || VT != MVT::f64) {
29575-
SDValue XInt = DAG.getNode(ISD::BITCAST, DL, IVT, X);
29576-
SDValue ZeroCst = DAG.getConstant(0, DL, IVT);
29577-
IsXSigned = DAG.getSetCC(DL, SetCCType, XInt, ZeroCst, ISD::SETLT);
29578-
} else {
29579-
assert(VT == MVT::f64);
29580-
SDValue Ins = DAG.getNode(ISD::INSERT_VECTOR_ELT, DL, MVT::v2f64,
29581-
DAG.getConstantFP(0, DL, MVT::v2f64), X,
29582-
DAG.getVectorIdxConstant(0, DL));
29583-
SDValue VX = DAG.getNode(ISD::BITCAST, DL, MVT::v4f32, Ins);
29584-
SDValue Hi = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, MVT::f32, VX,
29585-
DAG.getVectorIdxConstant(1, DL));
29586-
Hi = DAG.getBitcast(MVT::i32, Hi);
29587-
SDValue ZeroCst = DAG.getConstant(0, DL, MVT::i32);
29588-
EVT SetCCType = TLI.getSetCCResultType(DAG.getDataLayout(),
29589-
*DAG.getContext(), MVT::i32);
29590-
IsXSigned = DAG.getSetCC(DL, SetCCType, Hi, ZeroCst, ISD::SETLT);
29591-
}
29592-
if (MinMaxOp == X86ISD::FMAX) {
29593-
NewX = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29594-
NewY = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29595-
} else {
29596-
NewX = DAG.getSelect(DL, VT, IsXSigned, Y, X);
29597-
NewY = DAG.getSelect(DL, VT, IsXSigned, X, Y);
29598-
}
2959929570
}
2960029571

2960129572
bool IgnoreNaN = DAG.getTarget().Options.NoNaNsFPMath ||
@@ -29612,10 +29583,80 @@ static SDValue LowerFMINIMUM_FMAXIMUM(SDValue Op, const X86Subtarget &Subtarget,
2961229583

2961329584
SDValue MinMax = DAG.getNode(MinMaxOp, DL, VT, NewX, NewY, Op->getFlags());
2961429585

29586+
// Fix up signed zeros: AND (max) or OR (min) the result's sign with NewX's.
29587+
if (ShouldHandleZeros) {
29588+
const fltSemantics &Sem = VT.getFltSemantics();
29589+
unsigned EltBits = VT.getScalarSizeInBits();
29590+
bool IsFakeVector = !VT.isVector();
29591+
MVT LogicVT = VT.getSimpleVT();
29592+
if (IsFakeVector)
29593+
LogicVT = (VT == MVT::f64) ? MVT::v2f64
29594+
: (VT == MVT::f32) ? MVT::v4f32
29595+
: MVT::v8f16;
29596+
29597+
// We take the sign bit from the first operand and combine it with the
29598+
// output sign bit (see below). Right now, if ShouldHandleZeros is true, the
29599+
// operands will never have been swapped. If you add another optimization
29600+
// that swaps the input operands if one is a known value, make sure this
29601+
// logic stays correct!
29602+
SDValue LogicX = NewX;
29603+
SDValue LogicMinMax = MinMax;
29604+
if (IsFakeVector) {
29605+
// Promote scalars to vectors for bitwise operations.
29606+
LogicX = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, NewX);
29607+
LogicMinMax = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, LogicVT, MinMax);
29608+
}
29609+
29610+
// x86's min/max operations return the second operand if both inputs are
29611+
// signed zero. For the maximum operation, we want to "and" the sign bit of
29612+
// the output with the sign bit of the first operand--that means that if the
29613+
// first operand is +0.0, the output will be too. For the minimum, it's the
29614+
// opposite: we "or" the output sign bit with the sign bit of the first
29615+
// operand, ensuring that if the first operand is -0.0, the output will be
29616+
// too.
29617+
SDValue Result;
29618+
if (IsMaxOp) {
29619+
// getSignedMaxValue returns a bit pattern of all ones but the highest
29620+
// bit. We "or" that with the first operand, then "and" that with the max
29621+
// operation's result. That clears only the sign bit, and only if the
29622+
// first operand is positive.
29623+
SDValue OrMask = DAG.getConstantFP(
29624+
APFloat(Sem, APInt::getSignedMaxValue(EltBits)), DL, LogicVT);
29625+
SDValue MaskedSignBit =
29626+
DAG.getNode(X86ISD::FOR, DL, LogicVT, LogicX, OrMask);
29627+
Result =
29628+
DAG.getNode(X86ISD::FAND, DL, LogicVT, MaskedSignBit, LogicMinMax);
29629+
} else {
29630+
// Likewise, getSignMask returns a bit pattern with only the highest bit
29631+
// set. This one *sets* only the sign bit, and only if the first operand
29632+
// is *negative*.
29633+
SDValue AndMask = DAG.getConstantFP(
29634+
APFloat(Sem, APInt::getSignMask(EltBits)), DL, LogicVT);
29635+
SDValue MaskedSignBit =
29636+
DAG.getNode(X86ISD::FAND, DL, LogicVT, LogicX, AndMask);
29637+
Result =
29638+
DAG.getNode(X86ISD::FOR, DL, LogicVT, MaskedSignBit, LogicMinMax);
29639+
}
29640+
29641+
// Extract scalar back from vector.
29642+
if (IsFakeVector)
29643+
MinMax = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, DL, VT, Result,
29644+
DAG.getVectorIdxConstant(0, DL));
29645+
else
29646+
MinMax = Result;
29647+
}
29648+
2961529649
if (IgnoreNaN || DAG.isKnownNeverNaN(IsNum ? NewY : NewX))
2961629650
return MinMax;
2961729651

29618-
SDValue NaNSrc = IsNum ? MinMax : NewX;
29652+
// x86's min/max instructions return the second operand if either is NaN,
29653+
// which matches neither the numeric nor the non-numeric semantics. For the
29654+
// non-numeric versions, we want to return NaN if either operand is NaN. To
29655+
// do that, we check if NewX (the first operand) is NaN, and select it if
29656+
// so. For the numeric versions, we want to return the non-NaN operand if
29657+
// there is one, so we check if NewY (the second operand) is NaN, and again
29658+
// select the first operand if so.
29659+
SDValue NaNSrc = IsNum ? NewY : NewX;
2961929660
SDValue IsNaN = DAG.getSetCC(DL, SetCCType, NaNSrc, NaNSrc, ISD::SETUO);
2962029661

2962129662
return DAG.getSelect(DL, VT, IsNaN, NewX, MinMax);

llvm/test/CodeGen/X86/avx512fp16-fminimum-fmaximum.ll

Lines changed: 16 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -13,14 +13,9 @@ declare <32 x half> @llvm.maximum.v32f16(<32 x half>, <32 x half>)
1313
define half @test_fminimum(half %x, half %y) {
1414
; CHECK-LABEL: test_fminimum:
1515
; CHECK: # %bb.0:
16-
; CHECK-NEXT: vmovw %xmm0, %eax
17-
; CHECK-NEXT: testw %ax, %ax
18-
; CHECK-NEXT: sets %al
19-
; CHECK-NEXT: kmovd %eax, %k1
20-
; CHECK-NEXT: vmovaps %xmm1, %xmm2
21-
; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm2 {%k1}
22-
; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1}
23-
; CHECK-NEXT: vminsh %xmm2, %xmm0, %xmm1
16+
; CHECK-NEXT: vminsh %xmm1, %xmm0, %xmm2
17+
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
18+
; CHECK-NEXT: vpternlogq {{.*#+}} xmm1 = (xmm1 & xmm0) | xmm2
2419
; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1
2520
; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1}
2621
; CHECK-NEXT: vmovaps %xmm1, %xmm0
@@ -92,16 +87,12 @@ define half @test_fminimum_combine_cmps(half %x, half %y) {
9287
define half @test_fmaximum(half %x, half %y) {
9388
; CHECK-LABEL: test_fmaximum:
9489
; CHECK: # %bb.0:
95-
; CHECK-NEXT: vmovw %xmm0, %eax
96-
; CHECK-NEXT: testw %ax, %ax
97-
; CHECK-NEXT: sets %al
98-
; CHECK-NEXT: kmovd %eax, %k1
99-
; CHECK-NEXT: vmovaps %xmm0, %xmm2
100-
; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm2 {%k1}
90+
; CHECK-NEXT: vmaxsh %xmm1, %xmm0, %xmm2
91+
; CHECK-NEXT: vpbroadcastw {{.*#+}} xmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
92+
; CHECK-NEXT: vpternlogq {{.*#+}} xmm1 = xmm2 & (xmm1 | xmm0)
93+
; CHECK-NEXT: vcmpunordsh %xmm0, %xmm0, %k1
10194
; CHECK-NEXT: vmovsh %xmm0, %xmm0, %xmm1 {%k1}
102-
; CHECK-NEXT: vmaxsh %xmm2, %xmm1, %xmm0
103-
; CHECK-NEXT: vcmpunordsh %xmm1, %xmm1, %k1
104-
; CHECK-NEXT: vmovsh %xmm1, %xmm0, %xmm0 {%k1}
95+
; CHECK-NEXT: vmovaps %xmm1, %xmm0
10596
; CHECK-NEXT: retq
10697
%r = call half @llvm.maximum.f16(half %x, half %y)
10798
ret half %r
@@ -196,10 +187,9 @@ define <16 x half> @test_fmaximum_v16f16_nans(<16 x half> %x, <16 x half> %y) "n
196187
define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "no-nans-fp-math"="true" {
197188
; CHECK-LABEL: test_fminimum_v32f16_szero:
198189
; CHECK: # %bb.0:
199-
; CHECK-NEXT: vpmovw2m %zmm0, %k1
200-
; CHECK-NEXT: vpblendmw %zmm0, %zmm1, %zmm2 {%k1}
201-
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
202-
; CHECK-NEXT: vminph %zmm2, %zmm0, %zmm0
190+
; CHECK-NEXT: vminph %zmm1, %zmm0, %zmm1
191+
; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm2 = [-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0,-0.0E+0]
192+
; CHECK-NEXT: vpternlogq {{.*#+}} zmm0 = (zmm0 & zmm2) | zmm1
203193
; CHECK-NEXT: retq
204194
%r = call <32 x half> @llvm.minimum.v32f16(<32 x half> %x, <32 x half> %y)
205195
ret <32 x half> %r
@@ -208,12 +198,12 @@ define <32 x half> @test_fminimum_v32f16_szero(<32 x half> %x, <32 x half> %y) "
208198
define <32 x half> @test_fmaximum_v32f16_nans_szero(<32 x half> %x, <32 x half> %y) {
209199
; CHECK-LABEL: test_fmaximum_v32f16_nans_szero:
210200
; CHECK: # %bb.0:
211-
; CHECK-NEXT: vpmovw2m %zmm0, %k1
212-
; CHECK-NEXT: vpblendmw %zmm1, %zmm0, %zmm2 {%k1}
201+
; CHECK-NEXT: vmaxph %zmm1, %zmm0, %zmm2
202+
; CHECK-NEXT: vpbroadcastw {{.*#+}} zmm1 = [NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN,NaN]
203+
; CHECK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm2 & (zmm1 | zmm0)
204+
; CHECK-NEXT: vcmpunordph %zmm0, %zmm0, %k1
213205
; CHECK-NEXT: vmovdqu16 %zmm0, %zmm1 {%k1}
214-
; CHECK-NEXT: vmaxph %zmm2, %zmm1, %zmm0
215-
; CHECK-NEXT: vcmpunordph %zmm1, %zmm1, %k1
216-
; CHECK-NEXT: vmovdqu16 %zmm1, %zmm0 {%k1}
206+
; CHECK-NEXT: vmovdqa64 %zmm1, %zmm0
217207
; CHECK-NEXT: retq
218208
%r = call <32 x half> @llvm.maximum.v32f16(<32 x half> %x, <32 x half> %y)
219209
ret <32 x half> %r

0 commit comments

Comments
 (0)