[LoongArch] Support memcmp expansion for vectors and combine for i128/i256 setcc #167828
base: users/zhaoqi5/tests-memcmp-expansion-vec
Conversation
…/i256 setcc

This commit enables memcmp expansion for lsx/lasx. After doing this, i128 and i256 loads, which are illegal types on LoongArch, will be generated. Without further processing, they would be split into legal scalar types. So this commit also enables a combine for `setcc` to bitcast i128/i256 types to vector types before type legalization and generate vector instructions. Inspired by x86 and riscv.
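For illustration, a minimal IR sketch (hypothetical example, not taken from this patch's tests) of the oversized equality compare the new `setcc` combine targets. On LA64 with LSX, such an i128 compare is now bitcast to v2i64, XORed and reduced with a vector OR-reduction instead of being split into two scalar 64-bit compares during type legalization:

```llvm
define i1 @eq_i128(ptr %p, ptr %q) {
  ; loaded operands are cheap to bitcast to vectors, one of the
  ; conditions the combine checks
  %a = load i128, ptr %p
  %b = load i128, ptr %q
  %c = icmp eq i128 %a, %b
  ret i1 %c
}
```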
@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

This commit enables memcmp expansion for lsx/lasx. After doing this, i128 and i256 loads, which are illegal types on LoongArch, will be generated. Without further processing, they would be split into legal scalar types. So this commit also enables a combine for `setcc` to bitcast i128/i256 types to vector types before type legalization and generate vector instructions. Inspired by x86 and riscv.

Patch is 83.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167828.diff

4 Files Affected:
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index cf4ffc82f6009..d7d820e4505fd 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -5733,11 +5733,8 @@ static bool checkValueWidth(SDValue V, ISD::LoadExtType &ExtType) { // +-------------+ // | CMP | // +-------------+ -static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, - TargetLowering::DAGCombinerInfo &DCI, - const LoongArchSubtarget &Subtarget) { - ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); - +static SDValue combineTruncZExtAndSetcc(SDNode *N, SelectionDAG &DAG, EVT VT, + ISD::CondCode CC, const SDLoc &DL) { SDNode *AndNode = N->getOperand(0).getNode(); if (AndNode->getOpcode() != ISD::AND) return SDValue(); @@ -5793,14 +5790,123 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, return SDValue(); // These truncation and zero-extension nodes are not necessary, remove them. - SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N), AndNode->getValueType(0), + SDValue NewAnd = DAG.getNode(ISD::AND, DL, AndNode->getValueType(0), TruncInputValue1, TruncInputValue2); - SDValue NewSetCC = - DAG.getSetCC(SDLoc(N), N->getValueType(0), NewAnd, TruncInputValue2, CC); + SDValue NewSetCC = DAG.getSetCC(DL, VT, NewAnd, TruncInputValue2, CC); DAG.ReplaceAllUsesWith(N, NewSetCC.getNode()); return SDValue(N, 0); } +/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a +/// recognizable memcmp expansion. +static bool isOrXorXorTree(SDValue X, bool Root = true) { + if (X.getOpcode() == ISD::OR) + return isOrXorXorTree(X.getOperand(0), false) && + isOrXorXorTree(X.getOperand(1), false); + if (Root) + return false; + return X.getOpcode() == ISD::XOR; +} + +/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp +/// expansion. +static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG, + EVT VecVT) { + SDValue Op0 = X.getOperand(0); + SDValue Op1 = X.getOperand(1); + if (X.getOpcode() == ISD::OR) { + SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT); + SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT); + return DAG.getNode(ISD::OR, DL, VecVT, A, B); + } + if (X.getOpcode() == ISD::XOR) { + SDValue A = DAG.getBitcast(VecVT, Op0); + SDValue B = DAG.getBitcast(VecVT, Op1); + return DAG.getNode(ISD::XOR, DL, VecVT, A, B); + } + llvm_unreachable("Impossible"); +} + +/// Try to map a 128-bit or 256-bit integer comparison to vector instructions +/// before type legalization splits it up into chunks. +static SDValue +combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC, + const SDLoc &DL, SelectionDAG &DAG, + const LoongArchSubtarget &Subtarget) { + assert(isIntEqualitySetCC(CC) && "Bad comparison predicate"); + + EVT OpVT = X.getValueType(); + unsigned OpSize = OpVT.getSizeInBits(); + MVT GRLenVT = Subtarget.getGRLenVT(); + + // We're looking for an oversized integer equality comparison. 
+ if (!OpVT.isScalarInteger()) + return SDValue(); + + if (!(OpSize == 128 && Subtarget.hasExtLSX()) && + !(OpSize == 256 && Subtarget.hasExtLASX())) + return SDValue(); + + if (DAG.getMachineFunction().getFunction().hasFnAttribute( + Attribute::NoImplicitFloat)) + return SDValue(); + + // Check if this is a bitwise-combined equality comparison of 2 pairs of + // vectors: + // setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne + bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X); + + // Don't perform this combine if constructing the vector will be expensive. + auto IsVectorBitCastCheap = [](SDValue X) { + X = peekThroughBitcasts(X); + return isa<ConstantSDNode>(X) || X.getValueType().isVector() || + X.getOpcode() == ISD::LOAD; + }; + + if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) && + !IsOrXorXorTreeCCZero) + return SDValue(); + + // Treat as v2i64/v4i64 on LA64 and v4i32/v8i32 on LA32. + unsigned VecSize = OpSize / (Subtarget.is64Bit() ? 64 : 32); + EVT VecVT = + MVT::getVectorVT(Subtarget.is64Bit() ? MVT::i64 : MVT::i32, VecSize); + + SDValue Cmp; + if (IsOrXorXorTreeCCZero) { + Cmp = emitOrXorXorTree(X, DL, DAG, VecVT); + } else { + SDValue VecX = DAG.getBitcast(VecVT, X); + SDValue VecY = DAG.getBitcast(VecVT, Y); + Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY); + } + + return DAG.getSetCC(DL, VT, DAG.getNode(ISD::VECREDUCE_OR, DL, GRLenVT, Cmp), + DAG.getConstant(0, DL, GRLenVT), CC); +} + +static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG, + TargetLowering::DAGCombinerInfo &DCI, + const LoongArchSubtarget &Subtarget) { + SDLoc DL(N); + EVT VT = N->getValueType(0); + SDValue N0 = N->getOperand(0); + SDValue N1 = N->getOperand(1); + ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get(); + + if (SDValue V = combineTruncZExtAndSetcc(N, DAG, VT, CC, DL)) + return V; + + if (!isIntEqualitySetCC(CC)) + return SDValue(); + + if (SDValue V = + combineVectorSizedSetCCEquality(VT, N0, N1, CC, DL, DAG, Subtarget)) + return V; + + return SDValue(); +} + // Combine (loongarch_bitrev_w (loongarch_revb_2w X)) to loongarch_bitrev_4b. static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG, TargetLowering::DAGCombinerInfo &DCI, diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp index 5107c8def3799..74ffdf961e68f 100644 --- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp @@ -122,12 +122,17 @@ LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const { Options.NumLoadsPerBlock = Options.MaxNumLoads; Options.AllowOverlappingLoads = true; - // TODO: Support for vectors. 
+ if (IsZeroCmp && ST->hasExtLSX()) { + if (ST->hasExtLASX()) + Options.LoadSizes.push_back(32); + Options.LoadSizes.push_back(16); + } + if (ST->is64Bit()) { - Options.LoadSizes = {8, 4, 2, 1}; + Options.LoadSizes.append({8, 4, 2, 1}); Options.AllowedTailExpansions = {3, 5, 6}; } else { - Options.LoadSizes = {4, 2, 1}; + Options.LoadSizes.append({4, 2, 1}); Options.AllowedTailExpansions = {3}; } diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll index eb070aa98f4ad..141ddab5344bd 100644 --- a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll +++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll @@ -498,33 +498,25 @@ define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize { ; ; LA32-UAL-V-LABEL: bcmp_size_16: ; LA32-UAL-V: # %bb.0: # %entry -; LA32-UAL-V-NEXT: ld.w $a2, $a0, 0 -; LA32-UAL-V-NEXT: ld.w $a3, $a1, 0 -; LA32-UAL-V-NEXT: ld.w $a4, $a0, 4 -; LA32-UAL-V-NEXT: ld.w $a5, $a1, 4 -; LA32-UAL-V-NEXT: ld.w $a6, $a0, 8 -; LA32-UAL-V-NEXT: ld.w $a7, $a1, 8 -; LA32-UAL-V-NEXT: ld.w $a0, $a0, 12 -; LA32-UAL-V-NEXT: ld.w $a1, $a1, 12 -; LA32-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA32-UAL-V-NEXT: xor $a3, $a4, $a5 -; LA32-UAL-V-NEXT: xor $a4, $a6, $a7 -; LA32-UAL-V-NEXT: xor $a0, $a0, $a1 -; LA32-UAL-V-NEXT: or $a1, $a2, $a3 -; LA32-UAL-V-NEXT: or $a0, $a4, $a0 -; LA32-UAL-V-NEXT: or $a0, $a1, $a0 +; LA32-UAL-V-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-V-NEXT: vld $vr1, $a1, 0 +; LA32-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vpickve2gr.w $a0, $vr0, 0 ; LA32-UAL-V-NEXT: sltu $a0, $zero, $a0 ; LA32-UAL-V-NEXT: ret ; ; LA64-UAL-V-LABEL: bcmp_size_16: ; LA64-UAL-V: # %bb.0: # %entry -; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0 -; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0 -; LA64-UAL-V-NEXT: ld.d $a0, $a0, 8 -; LA64-UAL-V-NEXT: ld.d $a1, $a1, 8 -; LA64-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA64-UAL-V-NEXT: xor $a0, $a0, $a1 -; LA64-UAL-V-NEXT: or $a0, $a2, $a0 +; LA64-UAL-V-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-V-NEXT: vld $vr1, $a1, 0 +; LA64-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA64-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-V-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-V-NEXT: ret ; @@ -554,15 +546,15 @@ entry: } define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { -; LA32-LABEL: bcmp_size_31: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: ori $a2, $zero, 31 -; LA32-NEXT: bl bcmp -; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 16 -; LA32-NEXT: ret +; LA32-UAL-LABEL: bcmp_size_31: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: addi.w $sp, $sp, -16 +; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-UAL-NEXT: ori $a2, $zero, 31 +; LA32-UAL-NEXT: bl bcmp +; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-UAL-NEXT: addi.w $sp, $sp, 16 +; LA32-UAL-NEXT: ret ; ; LA64-UAL-LABEL: bcmp_size_31: ; LA64-UAL: # %bb.0: # %entry @@ -584,26 +576,48 @@ define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize { ; LA64-UAL-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-NEXT: ret ; +; LA32-UAL-V-LABEL: bcmp_size_31: +; LA32-UAL-V: # %bb.0: # %entry +; LA32-UAL-V-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-V-NEXT: vld $vr1, $a0, 15 +; 
LA32-UAL-V-NEXT: vld $vr2, $a1, 15 +; LA32-UAL-V-NEXT: vld $vr3, $a1, 0 +; LA32-UAL-V-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-V-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-V-NEXT: ret +; ; LA64-UAL-V-LABEL: bcmp_size_31: ; LA64-UAL-V: # %bb.0: # %entry -; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0 -; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0 -; LA64-UAL-V-NEXT: ld.d $a4, $a0, 8 -; LA64-UAL-V-NEXT: ld.d $a5, $a1, 8 -; LA64-UAL-V-NEXT: ld.d $a6, $a0, 16 -; LA64-UAL-V-NEXT: ld.d $a7, $a1, 16 -; LA64-UAL-V-NEXT: ld.d $a0, $a0, 23 -; LA64-UAL-V-NEXT: ld.d $a1, $a1, 23 -; LA64-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA64-UAL-V-NEXT: xor $a3, $a4, $a5 -; LA64-UAL-V-NEXT: xor $a4, $a6, $a7 -; LA64-UAL-V-NEXT: xor $a0, $a0, $a1 -; LA64-UAL-V-NEXT: or $a1, $a2, $a3 -; LA64-UAL-V-NEXT: or $a0, $a4, $a0 -; LA64-UAL-V-NEXT: or $a0, $a1, $a0 +; LA64-UAL-V-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-V-NEXT: vld $vr1, $a0, 15 +; LA64-UAL-V-NEXT: vld $vr2, $a1, 15 +; LA64-UAL-V-NEXT: vld $vr3, $a1, 0 +; LA64-UAL-V-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA64-UAL-V-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-V-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-V-NEXT: ret ; +; LA32-NUAL-LABEL: bcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; ; LA64-NUAL-LABEL: bcmp_size_31: ; LA64-NUAL: # %bb.0: # %entry ; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 @@ -620,15 +634,15 @@ entry: } define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { -; LA32-LABEL: bcmp_size_32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: ori $a2, $zero, 32 -; LA32-NEXT: bl bcmp -; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 16 -; LA32-NEXT: ret +; LA32-UAL-LABEL: bcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: addi.w $sp, $sp, -16 +; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-UAL-NEXT: ori $a2, $zero, 32 +; LA32-UAL-NEXT: bl bcmp +; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-UAL-NEXT: addi.w $sp, $sp, 16 +; LA32-UAL-NEXT: ret ; ; LA64-UAL-LABEL: bcmp_size_32: ; LA64-UAL: # %bb.0: # %entry @@ -650,25 +664,75 @@ define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { ; LA64-UAL-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-NEXT: ret ; -; LA64-UAL-V-LABEL: bcmp_size_32: -; LA64-UAL-V: # %bb.0: # %entry -; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0 -; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0 -; LA64-UAL-V-NEXT: ld.d $a4, $a0, 8 -; LA64-UAL-V-NEXT: ld.d $a5, $a1, 8 -; LA64-UAL-V-NEXT: ld.d $a6, $a0, 16 -; LA64-UAL-V-NEXT: ld.d $a7, $a1, 16 -; LA64-UAL-V-NEXT: ld.d $a0, $a0, 24 -; LA64-UAL-V-NEXT: ld.d $a1, $a1, 24 -; LA64-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA64-UAL-V-NEXT: xor $a3, $a4, $a5 -; LA64-UAL-V-NEXT: xor $a4, $a6, $a7 -; LA64-UAL-V-NEXT: xor $a0, $a0, $a1 -; 
LA64-UAL-V-NEXT: or $a1, $a2, $a3 -; LA64-UAL-V-NEXT: or $a0, $a4, $a0 -; LA64-UAL-V-NEXT: or $a0, $a1, $a0 -; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0 -; LA64-UAL-V-NEXT: ret +; LA32-UAL-LSX-LABEL: bcmp_size_32: +; LA32-UAL-LSX: # %bb.0: # %entry +; LA32-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-LSX-NEXT: vld $vr1, $a0, 16 +; LA32-UAL-LSX-NEXT: vld $vr2, $a1, 16 +; LA32-UAL-LSX-NEXT: vld $vr3, $a1, 0 +; LA32-UAL-LSX-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LSX-NEXT: ret +; +; LA64-UAL-LSX-LABEL: bcmp_size_32: +; LA64-UAL-LSX: # %bb.0: # %entry +; LA64-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-LSX-NEXT: vld $vr1, $a0, 16 +; LA64-UAL-LSX-NEXT: vld $vr2, $a1, 16 +; LA64-UAL-LSX-NEXT: vld $vr3, $a1, 0 +; LA64-UAL-LSX-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA64-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-LSX-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-LSX-NEXT: ret +; +; LA32-UAL-LASX-LABEL: bcmp_size_32: +; LA32-UAL-LASX: # %bb.0: # %entry +; LA32-UAL-LASX-NEXT: xvld $xr0, $a0, 0 +; LA32-UAL-LASX-NEXT: xvld $xr1, $a1, 0 +; LA32-UAL-LASX-NEXT: xvxor.v $xr0, $xr0, $xr1 +; LA32-UAL-LASX-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LASX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LASX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LASX-NEXT: ret +; +; LA64-UAL-LASX-LABEL: bcmp_size_32: +; LA64-UAL-LASX: # %bb.0: # %entry +; LA64-UAL-LASX-NEXT: xvld $xr0, $a0, 0 +; LA64-UAL-LASX-NEXT: xvld $xr1, $a1, 0 +; LA64-UAL-LASX-NEXT: xvxor.v $xr0, $xr0, $xr1 +; LA64-UAL-LASX-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-UAL-LASX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-LASX-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-UAL-LASX-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-LASX-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret ; ; LA64-NUAL-LABEL: bcmp_size_32: ; LA64-NUAL: # %bb.0: # %entry @@ -686,104 +750,502 @@ entry: } define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { -; LA32-LABEL: bcmp_size_63: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: ori $a2, $zero, 63 -; LA32-NEXT: bl bcmp -; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 16 -; LA32-NEXT: ret +; LA32-UAL-LABEL: bcmp_size_63: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: addi.w $sp, $sp, -16 +; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-UAL-NEXT: ori $a2, $zero, 63 
+; LA32-UAL-NEXT: bl bcmp +; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-UAL-NEXT: addi.w $sp, $sp, 16 +; LA32-UAL-NEXT: ret ; -; LA64-LABEL: bcmp_size_63: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -16 -; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: ori $a2, $zero, 63 -; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 16 -; LA64-NEXT: ret +; LA64-UAL-LABEL: bcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: addi.d $sp, $sp, -16 +; LA64-UAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-UAL-NEXT: ori $a2, $zero, 63 +; LA64-UAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-UAL-NEXT: jirl $ra, $ra, 0 +; LA64-UAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-UAL-NEXT: addi.d $sp, $sp, 16 +; LA64-UAL-NEXT: ret +; +; LA32-UAL-LSX-LABEL: bcmp_size_63: +; LA32-UAL-LSX: # %bb.0: # %entry +; LA32-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-LSX-NEXT: vld $vr1, $a1, 0 +; LA32-UAL-LSX-NEXT: vld $vr2, $a0, 32 +; LA32-UAL-LSX-NEXT: vld $vr3, $a0, 47 +; LA32-UAL-LSX-NEXT: vld $vr4, $a1, 47 +; LA32-UAL-LSX-NEXT: vld $vr5, $a1, 32 +; LA32-UAL-LSX-NEXT: vld $vr6, $a0, 16 +; LA32-UAL-LSX-NEXT: vld $vr7, $a1, 16 +; LA32-UAL-LSX-NEXT: vxor.v $vr3, $vr3, $vr4 +; LA32-UAL-LSX-NEXT: vxor.v $vr2, $vr2, $vr5 +; LA32-UAL-LSX-NEXT: vor.v $vr2, $vr2, $vr3 +; LA32-UAL-LSX-NEXT: vxor.v $vr3, $vr6, $vr7 +; LA32-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr3 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr2 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LSX-NEXT: ret +; +; LA64-UAL-LSX-LABEL: bcmp_size_63: +; LA64-UAL-LSX: # %bb.0: # %entry +; LA64-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-LSX-NEXT: vld $vr1, $a1, 0 +; LA64-UAL-LSX-NEXT: vld $vr2, $a0, 32 +; LA64-UAL-LSX-NEXT: vld $vr3, $a0, 47 +; LA64-UAL-LSX-NEXT: vld $vr4, $a1, 47 +; LA64-UAL-LSX-NEXT: vld $vr5, $a1, 32 +; LA64-UAL-LSX-NEXT: vld $vr6, $a0, 16 +; LA64-UAL-LSX-NEXT: vld $vr7, $a1, 16 +; LA64-UAL-LSX-NEXT: vxor.v $vr3, $vr3, $vr4 +; LA64-UAL-... [truncated] |
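To show the shape the expansion feeds into the new combine, here is a hand-written IR sketch (an approximation, not copied from the pass output) of the bcmp_size_31 case above: two overlapping 16-byte loads per buffer at offsets 0 and 15, whose XORs are ORed together and compared against zero, i.e. the `setcc (or (xor A, B), (xor C, D)), 0, eq` tree recognized by isOrXorXorTree() and lowered by emitOrXorXorTree():

```llvm
define i1 @bcmp31_eq(ptr %p, ptr %q) {
  ; overlapping tail: the second load starts at offset 15 (31 - 16)
  %p1 = getelementptr i8, ptr %p, i64 15
  %q1 = getelementptr i8, ptr %q, i64 15
  %a0 = load i128, ptr %p, align 1
  %b0 = load i128, ptr %q, align 1
  %a1 = load i128, ptr %p1, align 1
  %b1 = load i128, ptr %q1, align 1
  ; xor each pair, or the results, compare the combined value with zero
  %x0 = xor i128 %a0, %b0
  %x1 = xor i128 %a1, %b1
  %o = or i128 %x0, %x1
  %z = icmp eq i128 %o, 0
  ret i1 %z
}
```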
This commit enables memcmp expansion for lsx/lasx. After doing
this, i128 and i256 loads, which are illegal types on LoongArch,
will be generated. Without further processing, they would be split
into legal scalar types.
So this commit also enables a combine for
`setcc` to bitcast i128/i256 types to vector types before type legalization and
generate vector instructions.
Inspired by x86 and riscv.
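As a usage sketch (hypothetical IR, assuming LA64 with LASX and unaligned access enabled): with 32 and 16 added to LoadSizes, an equality-only comparison like the one below is expanded inline and then lowered through the vector setcc combine, matching the xvld/xvxor.v sequences in the updated tests instead of calling the library routine:

```llvm
declare i32 @memcmp(ptr, ptr, i64)

define i1 @memcmp32_eq(ptr %p, ptr %q) {
  ; the result is only compared against zero, so the equality-only
  ; (IsZeroCmp) expansion with vector-sized loads applies
  %r = call i32 @memcmp(ptr %p, ptr %q, i64 32)
  %z = icmp eq i32 %r, 0
  ret i1 %z
}
```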