Conversation

zhaoqi5 (Contributor) commented Nov 13, 2025

This commit enables memcmp expansion for LSX/LASX. After doing this,
i128 and i256 loads, which are illegal types on LoongArch, will be
generated. Without further handling, they would be split into legal
scalar types.

So this commit also enables a combine for setcc that bitcasts
i128/i256 types to vector types before type legalization and
generates vector instructions.

Inspired by the X86 and RISC-V backends.
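For illustration, the shape this targets is roughly the following (a hand-written IR sketch; the function and value names are invented, not taken from the patch). After memcmp expansion, a 32-byte zero-equality compare becomes two illegal i128 loads per operand folded into an or/xor tree, i.e. the `setcc (or (xor A, B), (xor C, D)), 0, eq|ne` pattern the new combine recognizes:

define i1 @bcmp32_eq(ptr %s1, ptr %s2) {
  ; Two 16-byte halves of each buffer, loaded as illegal i128 values.
  %p1.hi = getelementptr i8, ptr %s1, i64 16
  %p2.hi = getelementptr i8, ptr %s2, i64 16
  %a = load i128, ptr %s1
  %b = load i128, ptr %s2
  %c = load i128, ptr %p1.hi
  %d = load i128, ptr %p2.hi
  ; Equality of both halves folds into an or/xor tree compared against zero.
  %xab = xor i128 %a, %b
  %xcd = xor i128 %c, %d
  %or = or i128 %xab, %xcd
  %eq = icmp eq i128 %or, 0
  ret i1 %eq
}

Instead of letting type legalization split each i128 into GRLen-sized pieces, the combine bitcasts the operands to v2i64 (v4i32 on LA32), performs the xor/or on vectors, and reduces with a vector or-reduction before the final compare against zero.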

llvmbot (Member) commented Nov 13, 2025

@llvm/pr-subscribers-backend-loongarch

Author: ZhaoQi (zhaoqi5)

Changes

This commit enables memcmp expansion for LSX/LASX. After doing this,
i128 and i256 loads, which are illegal types on LoongArch, will be
generated. Without further handling, they would be split into legal
scalar types.

So this commit also enables a combine for setcc that bitcasts
i128/i256 types to vector types before type legalization and
generates vector instructions.

Inspired by the X86 and RISC-V backends.


Patch is 83.62 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/167828.diff

4 Files Affected:

  • (modified) llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp (+114-8)
  • (modified) llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp (+8-3)
  • (modified) llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll (+612-155)
  • (modified) llvm/test/CodeGen/LoongArch/expandmemcmp.ll (+671-298)
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index cf4ffc82f6009..d7d820e4505fd 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -5733,11 +5733,8 @@ static bool checkValueWidth(SDValue V, ISD::LoadExtType &ExtType) {
 // +-------------+
 // | CMP |
 // +-------------+
-static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
-                                   TargetLowering::DAGCombinerInfo &DCI,
-                                   const LoongArchSubtarget &Subtarget) {
-  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
-
+static SDValue combineTruncZExtAndSetcc(SDNode *N, SelectionDAG &DAG, EVT VT,
+                                        ISD::CondCode CC, const SDLoc &DL) {
   SDNode *AndNode = N->getOperand(0).getNode();
   if (AndNode->getOpcode() != ISD::AND)
     return SDValue();
@@ -5793,14 +5790,123 @@ static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
     return SDValue();
 
   // These truncation and zero-extension nodes are not necessary, remove them.
-  SDValue NewAnd = DAG.getNode(ISD::AND, SDLoc(N), AndNode->getValueType(0),
+  SDValue NewAnd = DAG.getNode(ISD::AND, DL, AndNode->getValueType(0),
                                TruncInputValue1, TruncInputValue2);
-  SDValue NewSetCC =
-      DAG.getSetCC(SDLoc(N), N->getValueType(0), NewAnd, TruncInputValue2, CC);
+  SDValue NewSetCC = DAG.getSetCC(DL, VT, NewAnd, TruncInputValue2, CC);
   DAG.ReplaceAllUsesWith(N, NewSetCC.getNode());
   return SDValue(N, 0);
 }
 
+/// Recursive helper for combineVectorSizedSetCCEquality() to see if we have a
+/// recognizable memcmp expansion.
+static bool isOrXorXorTree(SDValue X, bool Root = true) {
+  if (X.getOpcode() == ISD::OR)
+    return isOrXorXorTree(X.getOperand(0), false) &&
+           isOrXorXorTree(X.getOperand(1), false);
+  if (Root)
+    return false;
+  return X.getOpcode() == ISD::XOR;
+}
+
+/// Recursive helper for combineVectorSizedSetCCEquality() to emit the memcmp
+/// expansion.
+static SDValue emitOrXorXorTree(SDValue X, const SDLoc &DL, SelectionDAG &DAG,
+                                EVT VecVT) {
+  SDValue Op0 = X.getOperand(0);
+  SDValue Op1 = X.getOperand(1);
+  if (X.getOpcode() == ISD::OR) {
+    SDValue A = emitOrXorXorTree(Op0, DL, DAG, VecVT);
+    SDValue B = emitOrXorXorTree(Op1, DL, DAG, VecVT);
+    return DAG.getNode(ISD::OR, DL, VecVT, A, B);
+  }
+  if (X.getOpcode() == ISD::XOR) {
+    SDValue A = DAG.getBitcast(VecVT, Op0);
+    SDValue B = DAG.getBitcast(VecVT, Op1);
+    return DAG.getNode(ISD::XOR, DL, VecVT, A, B);
+  }
+  llvm_unreachable("Impossible");
+}
+
+/// Try to map a 128-bit or 256-bit integer comparison to vector instructions
+/// before type legalization splits it up into chunks.
+static SDValue
+combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y, ISD::CondCode CC,
+                                const SDLoc &DL, SelectionDAG &DAG,
+                                const LoongArchSubtarget &Subtarget) {
+  assert(isIntEqualitySetCC(CC) && "Bad comparison predicate");
+
+  EVT OpVT = X.getValueType();
+  unsigned OpSize = OpVT.getSizeInBits();
+  MVT GRLenVT = Subtarget.getGRLenVT();
+
+  // We're looking for an oversized integer equality comparison.
+  if (!OpVT.isScalarInteger())
+    return SDValue();
+
+  if (!(OpSize == 128 && Subtarget.hasExtLSX()) &&
+      !(OpSize == 256 && Subtarget.hasExtLASX()))
+    return SDValue();
+
+  if (DAG.getMachineFunction().getFunction().hasFnAttribute(
+          Attribute::NoImplicitFloat))
+    return SDValue();
+
+  // Check if this is a bitwise-combined equality comparison of 2 pairs of
+  // vectors:
+  //   setcc i128 (or (xor A, B), (xor C, D)), 0, eq|ne
+  bool IsOrXorXorTreeCCZero = isNullConstant(Y) && isOrXorXorTree(X);
+
+  // Don't perform this combine if constructing the vector will be expensive.
+  auto IsVectorBitCastCheap = [](SDValue X) {
+    X = peekThroughBitcasts(X);
+    return isa<ConstantSDNode>(X) || X.getValueType().isVector() ||
+           X.getOpcode() == ISD::LOAD;
+  };
+
+  if ((!IsVectorBitCastCheap(X) || !IsVectorBitCastCheap(Y)) &&
+      !IsOrXorXorTreeCCZero)
+    return SDValue();
+
+  // Treat as v2i64/v4i64 on LA64 and v4i32/v8i32 on LA32.
+  unsigned VecSize = OpSize / (Subtarget.is64Bit() ? 64 : 32);
+  EVT VecVT =
+      MVT::getVectorVT(Subtarget.is64Bit() ? MVT::i64 : MVT::i32, VecSize);
+
+  SDValue Cmp;
+  if (IsOrXorXorTreeCCZero) {
+    Cmp = emitOrXorXorTree(X, DL, DAG, VecVT);
+  } else {
+    SDValue VecX = DAG.getBitcast(VecVT, X);
+    SDValue VecY = DAG.getBitcast(VecVT, Y);
+    Cmp = DAG.getNode(ISD::XOR, DL, VecVT, VecX, VecY);
+  }
+
+  return DAG.getSetCC(DL, VT, DAG.getNode(ISD::VECREDUCE_OR, DL, GRLenVT, Cmp),
+                      DAG.getConstant(0, DL, GRLenVT), CC);
+}
+
+static SDValue performSETCCCombine(SDNode *N, SelectionDAG &DAG,
+                                   TargetLowering::DAGCombinerInfo &DCI,
+                                   const LoongArchSubtarget &Subtarget) {
+  SDLoc DL(N);
+  EVT VT = N->getValueType(0);
+  SDValue N0 = N->getOperand(0);
+  SDValue N1 = N->getOperand(1);
+  ISD::CondCode CC = cast<CondCodeSDNode>(N->getOperand(2))->get();
+
+  if (SDValue V = combineTruncZExtAndSetcc(N, DAG, VT, CC, DL))
+    return V;
+
+  if (!isIntEqualitySetCC(CC))
+    return SDValue();
+
+  if (SDValue V =
+          combineVectorSizedSetCCEquality(VT, N0, N1, CC, DL, DAG, Subtarget))
+    return V;
+
+  return SDValue();
+}
+
 // Combine (loongarch_bitrev_w (loongarch_revb_2w X)) to loongarch_bitrev_4b.
 static SDValue performBITREV_WCombine(SDNode *N, SelectionDAG &DAG,
                                       TargetLowering::DAGCombinerInfo &DCI,
diff --git a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
index 5107c8def3799..74ffdf961e68f 100644
--- a/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchTargetTransformInfo.cpp
@@ -122,12 +122,17 @@ LoongArchTTIImpl::enableMemCmpExpansion(bool OptSize, bool IsZeroCmp) const {
   Options.NumLoadsPerBlock = Options.MaxNumLoads;
   Options.AllowOverlappingLoads = true;
 
-  // TODO: Support for vectors.
+  if (IsZeroCmp && ST->hasExtLSX()) {
+    if (ST->hasExtLASX())
+      Options.LoadSizes.push_back(32);
+    Options.LoadSizes.push_back(16);
+  }
+
   if (ST->is64Bit()) {
-    Options.LoadSizes = {8, 4, 2, 1};
+    Options.LoadSizes.append({8, 4, 2, 1});
     Options.AllowedTailExpansions = {3, 5, 6};
   } else {
-    Options.LoadSizes = {4, 2, 1};
+    Options.LoadSizes.append({4, 2, 1});
     Options.AllowedTailExpansions = {3};
   }
 
diff --git a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
index eb070aa98f4ad..141ddab5344bd 100644
--- a/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
+++ b/llvm/test/CodeGen/LoongArch/expandmemcmp-optsize.ll
@@ -498,33 +498,25 @@ define signext i32 @bcmp_size_16(ptr %s1, ptr %s2) nounwind optsize {
 ;
 ; LA32-UAL-V-LABEL: bcmp_size_16:
 ; LA32-UAL-V: # %bb.0: # %entry
-; LA32-UAL-V-NEXT: ld.w $a2, $a0, 0
-; LA32-UAL-V-NEXT: ld.w $a3, $a1, 0
-; LA32-UAL-V-NEXT: ld.w $a4, $a0, 4
-; LA32-UAL-V-NEXT: ld.w $a5, $a1, 4
-; LA32-UAL-V-NEXT: ld.w $a6, $a0, 8
-; LA32-UAL-V-NEXT: ld.w $a7, $a1, 8
-; LA32-UAL-V-NEXT: ld.w $a0, $a0, 12
-; LA32-UAL-V-NEXT: ld.w $a1, $a1, 12
-; LA32-UAL-V-NEXT: xor $a2, $a2, $a3
-; LA32-UAL-V-NEXT: xor $a3, $a4, $a5
-; LA32-UAL-V-NEXT: xor $a4, $a6, $a7
-; LA32-UAL-V-NEXT: xor $a0, $a0, $a1
-; LA32-UAL-V-NEXT: or $a1, $a2, $a3
-; LA32-UAL-V-NEXT: or $a0, $a4, $a0
-; LA32-UAL-V-NEXT: or $a0, $a1, $a0
+; LA32-UAL-V-NEXT: vld $vr0, $a0, 0
+; LA32-UAL-V-NEXT: vld $vr1, $a1, 0
+; LA32-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr1
+; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8
+; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0
+; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 4
+; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0
+; LA32-UAL-V-NEXT: vpickve2gr.w $a0, $vr0, 0
 ; LA32-UAL-V-NEXT: sltu $a0, $zero, $a0
 ; LA32-UAL-V-NEXT: ret
 ;
 ; LA64-UAL-V-LABEL: bcmp_size_16:
 ; LA64-UAL-V: # %bb.0: # %entry
-; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0
-; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0
-; LA64-UAL-V-NEXT: ld.d $a0, $a0, 8
-; LA64-UAL-V-NEXT: ld.d $a1, $a1, 8
-; LA64-UAL-V-NEXT: xor $a2, $a2, $a3
-; LA64-UAL-V-NEXT: xor $a0, $a0, $a1
-; LA64-UAL-V-NEXT: or $a0, $a2, $a0
+; LA64-UAL-V-NEXT: vld $vr0, $a0, 0
+; LA64-UAL-V-NEXT: vld $vr1, $a1, 0
+; LA64-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr1
+; LA64-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8
+; LA64-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0
+; LA64-UAL-V-NEXT: vpickve2gr.d $a0, $vr0, 0
 ; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0
 ; LA64-UAL-V-NEXT: ret
 ;
@@ -554,15 +546,15 @@ entry:
 }
 
 define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
-; LA32-LABEL: bcmp_size_31:
-; LA32: # %bb.0: # %entry
-; LA32-NEXT: addi.w $sp, $sp, -16
-; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
-; LA32-NEXT: ori $a2, $zero, 31
-; LA32-NEXT: bl bcmp
-; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
-; LA32-NEXT: addi.w $sp, $sp, 16
-; LA32-NEXT: ret
+; LA32-UAL-LABEL: bcmp_size_31:
+; LA32-UAL: # %bb.0: # %entry
+; LA32-UAL-NEXT: addi.w $sp, $sp, -16
+; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill
+; LA32-UAL-NEXT: ori $a2, $zero, 31
+; LA32-UAL-NEXT: bl bcmp
+; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload
+; LA32-UAL-NEXT: addi.w $sp, $sp, 16
+; LA32-UAL-NEXT: ret
 ;
 ; LA64-UAL-LABEL: bcmp_size_31:
 ; LA64-UAL: # %bb.0: # %entry
@@ -584,26 +576,48 @@ define signext i32 @bcmp_size_31(ptr %s1, ptr %s2) nounwind optsize {
 ; LA64-UAL-NEXT: sltu $a0, $zero, $a0
 ; LA64-UAL-NEXT: ret
 ;
+; LA32-UAL-V-LABEL: bcmp_size_31:
+; LA32-UAL-V: # %bb.0: # %entry
+; LA32-UAL-V-NEXT: vld $vr0, $a0, 0
+; LA32-UAL-V-NEXT: vld $vr1, $a0, 15
+;
LA32-UAL-V-NEXT: vld $vr2, $a1, 15 +; LA32-UAL-V-NEXT: vld $vr3, $a1, 0 +; LA32-UAL-V-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-V-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-V-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-V-NEXT: ret +; ; LA64-UAL-V-LABEL: bcmp_size_31: ; LA64-UAL-V: # %bb.0: # %entry -; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0 -; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0 -; LA64-UAL-V-NEXT: ld.d $a4, $a0, 8 -; LA64-UAL-V-NEXT: ld.d $a5, $a1, 8 -; LA64-UAL-V-NEXT: ld.d $a6, $a0, 16 -; LA64-UAL-V-NEXT: ld.d $a7, $a1, 16 -; LA64-UAL-V-NEXT: ld.d $a0, $a0, 23 -; LA64-UAL-V-NEXT: ld.d $a1, $a1, 23 -; LA64-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA64-UAL-V-NEXT: xor $a3, $a4, $a5 -; LA64-UAL-V-NEXT: xor $a4, $a6, $a7 -; LA64-UAL-V-NEXT: xor $a0, $a0, $a1 -; LA64-UAL-V-NEXT: or $a1, $a2, $a3 -; LA64-UAL-V-NEXT: or $a0, $a4, $a0 -; LA64-UAL-V-NEXT: or $a0, $a1, $a0 +; LA64-UAL-V-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-V-NEXT: vld $vr1, $a0, 15 +; LA64-UAL-V-NEXT: vld $vr2, $a1, 15 +; LA64-UAL-V-NEXT: vld $vr3, $a1, 0 +; LA64-UAL-V-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-UAL-V-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA64-UAL-V-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-V-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-V-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-V-NEXT: vpickve2gr.d $a0, $vr0, 0 ; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-V-NEXT: ret ; +; LA32-NUAL-LABEL: bcmp_size_31: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 31 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret +; ; LA64-NUAL-LABEL: bcmp_size_31: ; LA64-NUAL: # %bb.0: # %entry ; LA64-NUAL-NEXT: addi.d $sp, $sp, -16 @@ -620,15 +634,15 @@ entry: } define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { -; LA32-LABEL: bcmp_size_32: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: ori $a2, $zero, 32 -; LA32-NEXT: bl bcmp -; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 16 -; LA32-NEXT: ret +; LA32-UAL-LABEL: bcmp_size_32: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: addi.w $sp, $sp, -16 +; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-UAL-NEXT: ori $a2, $zero, 32 +; LA32-UAL-NEXT: bl bcmp +; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-UAL-NEXT: addi.w $sp, $sp, 16 +; LA32-UAL-NEXT: ret ; ; LA64-UAL-LABEL: bcmp_size_32: ; LA64-UAL: # %bb.0: # %entry @@ -650,25 +664,75 @@ define signext i32 @bcmp_size_32(ptr %s1, ptr %s2) nounwind optsize { ; LA64-UAL-NEXT: sltu $a0, $zero, $a0 ; LA64-UAL-NEXT: ret ; -; LA64-UAL-V-LABEL: bcmp_size_32: -; LA64-UAL-V: # %bb.0: # %entry -; LA64-UAL-V-NEXT: ld.d $a2, $a0, 0 -; LA64-UAL-V-NEXT: ld.d $a3, $a1, 0 -; LA64-UAL-V-NEXT: ld.d $a4, $a0, 8 -; LA64-UAL-V-NEXT: ld.d $a5, $a1, 8 -; LA64-UAL-V-NEXT: ld.d $a6, $a0, 16 -; LA64-UAL-V-NEXT: ld.d $a7, $a1, 16 -; LA64-UAL-V-NEXT: ld.d $a0, $a0, 24 -; LA64-UAL-V-NEXT: ld.d $a1, $a1, 24 -; LA64-UAL-V-NEXT: xor $a2, $a2, $a3 -; LA64-UAL-V-NEXT: xor $a3, $a4, $a5 -; LA64-UAL-V-NEXT: xor $a4, $a6, $a7 -; LA64-UAL-V-NEXT: xor $a0, $a0, $a1 -; 
LA64-UAL-V-NEXT: or $a1, $a2, $a3 -; LA64-UAL-V-NEXT: or $a0, $a4, $a0 -; LA64-UAL-V-NEXT: or $a0, $a1, $a0 -; LA64-UAL-V-NEXT: sltu $a0, $zero, $a0 -; LA64-UAL-V-NEXT: ret +; LA32-UAL-LSX-LABEL: bcmp_size_32: +; LA32-UAL-LSX: # %bb.0: # %entry +; LA32-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-LSX-NEXT: vld $vr1, $a0, 16 +; LA32-UAL-LSX-NEXT: vld $vr2, $a1, 16 +; LA32-UAL-LSX-NEXT: vld $vr3, $a1, 0 +; LA32-UAL-LSX-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA32-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LSX-NEXT: ret +; +; LA64-UAL-LSX-LABEL: bcmp_size_32: +; LA64-UAL-LSX: # %bb.0: # %entry +; LA64-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-LSX-NEXT: vld $vr1, $a0, 16 +; LA64-UAL-LSX-NEXT: vld $vr2, $a1, 16 +; LA64-UAL-LSX-NEXT: vld $vr3, $a1, 0 +; LA64-UAL-LSX-NEXT: vxor.v $vr1, $vr1, $vr2 +; LA64-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr3 +; LA64-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-LSX-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-LSX-NEXT: ret +; +; LA32-UAL-LASX-LABEL: bcmp_size_32: +; LA32-UAL-LASX: # %bb.0: # %entry +; LA32-UAL-LASX-NEXT: xvld $xr0, $a0, 0 +; LA32-UAL-LASX-NEXT: xvld $xr1, $a1, 0 +; LA32-UAL-LASX-NEXT: xvxor.v $xr0, $xr0, $xr1 +; LA32-UAL-LASX-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA32-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LASX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LASX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LASX-NEXT: ret +; +; LA64-UAL-LASX-LABEL: bcmp_size_32: +; LA64-UAL-LASX: # %bb.0: # %entry +; LA64-UAL-LASX-NEXT: xvld $xr0, $a0, 0 +; LA64-UAL-LASX-NEXT: xvld $xr1, $a1, 0 +; LA64-UAL-LASX-NEXT: xvxor.v $xr0, $xr0, $xr1 +; LA64-UAL-LASX-NEXT: xvpermi.q $xr1, $xr0, 1 +; LA64-UAL-LASX-NEXT: vor.v $vr0, $vr0, $vr1 +; LA64-UAL-LASX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA64-UAL-LASX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA64-UAL-LASX-NEXT: vpickve2gr.d $a0, $vr0, 0 +; LA64-UAL-LASX-NEXT: sltu $a0, $zero, $a0 +; LA64-UAL-LASX-NEXT: ret +; +; LA32-NUAL-LABEL: bcmp_size_32: +; LA32-NUAL: # %bb.0: # %entry +; LA32-NUAL-NEXT: addi.w $sp, $sp, -16 +; LA32-NUAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-NUAL-NEXT: ori $a2, $zero, 32 +; LA32-NUAL-NEXT: bl bcmp +; LA32-NUAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-NUAL-NEXT: addi.w $sp, $sp, 16 +; LA32-NUAL-NEXT: ret ; ; LA64-NUAL-LABEL: bcmp_size_32: ; LA64-NUAL: # %bb.0: # %entry @@ -686,104 +750,502 @@ entry: } define signext i32 @bcmp_size_63(ptr %s1, ptr %s2) nounwind optsize { -; LA32-LABEL: bcmp_size_63: -; LA32: # %bb.0: # %entry -; LA32-NEXT: addi.w $sp, $sp, -16 -; LA32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill -; LA32-NEXT: ori $a2, $zero, 63 -; LA32-NEXT: bl bcmp -; LA32-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload -; LA32-NEXT: addi.w $sp, $sp, 16 -; LA32-NEXT: ret +; LA32-UAL-LABEL: bcmp_size_63: +; LA32-UAL: # %bb.0: # %entry +; LA32-UAL-NEXT: addi.w $sp, $sp, -16 +; LA32-UAL-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32-UAL-NEXT: ori $a2, $zero, 63 
+; LA32-UAL-NEXT: bl bcmp +; LA32-UAL-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32-UAL-NEXT: addi.w $sp, $sp, 16 +; LA32-UAL-NEXT: ret ; -; LA64-LABEL: bcmp_size_63: -; LA64: # %bb.0: # %entry -; LA64-NEXT: addi.d $sp, $sp, -16 -; LA64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill -; LA64-NEXT: ori $a2, $zero, 63 -; LA64-NEXT: pcaddu18i $ra, %call36(bcmp) -; LA64-NEXT: jirl $ra, $ra, 0 -; LA64-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload -; LA64-NEXT: addi.d $sp, $sp, 16 -; LA64-NEXT: ret +; LA64-UAL-LABEL: bcmp_size_63: +; LA64-UAL: # %bb.0: # %entry +; LA64-UAL-NEXT: addi.d $sp, $sp, -16 +; LA64-UAL-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; LA64-UAL-NEXT: ori $a2, $zero, 63 +; LA64-UAL-NEXT: pcaddu18i $ra, %call36(bcmp) +; LA64-UAL-NEXT: jirl $ra, $ra, 0 +; LA64-UAL-NEXT: ld.d $ra, $sp, 8 # 8-byte Folded Reload +; LA64-UAL-NEXT: addi.d $sp, $sp, 16 +; LA64-UAL-NEXT: ret +; +; LA32-UAL-LSX-LABEL: bcmp_size_63: +; LA32-UAL-LSX: # %bb.0: # %entry +; LA32-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA32-UAL-LSX-NEXT: vld $vr1, $a1, 0 +; LA32-UAL-LSX-NEXT: vld $vr2, $a0, 32 +; LA32-UAL-LSX-NEXT: vld $vr3, $a0, 47 +; LA32-UAL-LSX-NEXT: vld $vr4, $a1, 47 +; LA32-UAL-LSX-NEXT: vld $vr5, $a1, 32 +; LA32-UAL-LSX-NEXT: vld $vr6, $a0, 16 +; LA32-UAL-LSX-NEXT: vld $vr7, $a1, 16 +; LA32-UAL-LSX-NEXT: vxor.v $vr3, $vr3, $vr4 +; LA32-UAL-LSX-NEXT: vxor.v $vr2, $vr2, $vr5 +; LA32-UAL-LSX-NEXT: vor.v $vr2, $vr2, $vr3 +; LA32-UAL-LSX-NEXT: vxor.v $vr3, $vr6, $vr7 +; LA32-UAL-LSX-NEXT: vxor.v $vr0, $vr0, $vr1 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr3 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr0, $vr2 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 8 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vbsrl.v $vr1, $vr0, 4 +; LA32-UAL-LSX-NEXT: vor.v $vr0, $vr1, $vr0 +; LA32-UAL-LSX-NEXT: vpickve2gr.w $a0, $vr0, 0 +; LA32-UAL-LSX-NEXT: sltu $a0, $zero, $a0 +; LA32-UAL-LSX-NEXT: ret +; +; LA64-UAL-LSX-LABEL: bcmp_size_63: +; LA64-UAL-LSX: # %bb.0: # %entry +; LA64-UAL-LSX-NEXT: vld $vr0, $a0, 0 +; LA64-UAL-LSX-NEXT: vld $vr1, $a1, 0 +; LA64-UAL-LSX-NEXT: vld $vr2, $a0, 32 +; LA64-UAL-LSX-NEXT: vld $vr3, $a0, 47 +; LA64-UAL-LSX-NEXT: vld $vr4, $a1, 47 +; LA64-UAL-LSX-NEXT: vld $vr5, $a1, 32 +; LA64-UAL-LSX-NEXT: vld $vr6, $a0, 16 +; LA64-UAL-LSX-NEXT: vld $vr7, $a1, 16 +; LA64-UAL-LSX-NEXT: vxor.v $vr3, $vr3, $vr4 +; LA64-UAL-... [truncated] 
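A related hand-written sketch (function and value names invented) for the 31-byte case checked above: because AllowOverlappingLoads is set, the expansion covers the tail with a second 16-byte load at offset 15, which is why the vector checks load from offsets 0 and 15 of each buffer:

define i1 @bcmp31_eq(ptr %s1, ptr %s2) {
  %p1.tail = getelementptr i8, ptr %s1, i64 15
  %p2.tail = getelementptr i8, ptr %s2, i64 15
  %a = load i128, ptr %s1          ; bytes 0-15
  %b = load i128, ptr %s2
  %c = load i128, ptr %p1.tail     ; bytes 15-30, overlapping byte 15
  %d = load i128, ptr %p2.tail
  %xab = xor i128 %a, %b
  %xcd = xor i128 %c, %d
  %or = or i128 %xab, %xcd
  %eq = icmp eq i128 %or, 0
  ret i1 %eq
}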