
Conversation

@brendandahl
Contributor

No description provided.

@llvmbot added the clang, backend:WebAssembly, backend:X86, clang:headers, and llvm:mc labels on Mar 4, 2025
@llvmbot
Member

llvmbot commented Mar 4, 2025

@llvm/pr-subscribers-backend-x86

@llvm/pr-subscribers-clang

Author: Brendan Dahl (brendandahl)

Changes

Full diff: https://github.com/llvm/llvm-project/pull/129786.diff

6 Files Affected:

  • (modified) clang/lib/Headers/wasm_simd128.h (+8)
  • (modified) cross-project-tests/intrinsic-header-tests/wasm_simd128.c (+6)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp (+31-14)
  • (modified) llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td (+1)
  • (modified) llvm/test/CodeGen/WebAssembly/half-precision.ll (+20)
  • (modified) llvm/test/MC/WebAssembly/simd-encodings.s (+3)
diff --git a/clang/lib/Headers/wasm_simd128.h b/clang/lib/Headers/wasm_simd128.h
index 08e39bf1a79b4..c509d7841135e 100644
--- a/clang/lib/Headers/wasm_simd128.h
+++ b/clang/lib/Headers/wasm_simd128.h
@@ -45,6 +45,7 @@ typedef int __i32x2 __attribute__((__vector_size__(8), __aligned__(8)));
 typedef unsigned int __u32x2 __attribute__((__vector_size__(8), __aligned__(8)));
 typedef float __f32x2 __attribute__((__vector_size__(8), __aligned__(8)));
+typedef __fp16 __f16x4 __attribute__((__vector_size__(8), __aligned__(8)));
 
 #define __DEFAULT_FN_ATTRS \
   __attribute__((__always_inline__, __nodebug__, __target__("simd128"), \
@@ -2010,6 +2011,13 @@ static __inline__ v128_t __FP16_FN_ATTRS wasm_f16x8_convert_u16x8(v128_t __a) {
   return (v128_t) __builtin_convertvector((__u16x8)__a, __f16x8);
 }
 
+static __inline__ v128_t __FP16_FN_ATTRS
+wasm_f32x4_promote_low_f16x8(v128_t __a) {
+  return (v128_t) __builtin_convertvector(
+      (__f16x4){((__f16x8)__a)[0], ((__f16x8)__a)[1],
+                ((__f16x8)__a)[2], ((__f16x8)__a)[3]}, __f32x4);
+}
+
 static __inline__ v128_t __FP16_FN_ATTRS
 wasm_f16x8_relaxed_madd(v128_t __a, v128_t __b, v128_t __c) {

diff --git a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
index b601d90cfcc92..1f4809483589e 100644
--- a/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
+++ b/cross-project-tests/intrinsic-header-tests/wasm_simd128.c
@@ -1033,6 +1033,12 @@ v128_t test_f64x2_promote_low_f32x4(v128_t a) {
   return wasm_f64x2_promote_low_f32x4(a);
 }
 
+// CHECK-LABEL: test_f32x4_promote_low_f16x8:
+// CHECK: f32x4.promote_low_f16x8{{$}}
+v128_t test_f32x4_promote_low_f16x8(v128_t a) {
+  return wasm_f32x4_promote_low_f16x8(a);
+}
+
 // CHECK-LABEL: test_i8x16_shuffle:
 // CHECK: i8x16.shuffle 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1,
 // 0{{$}}

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
index b24a45c2d8898..4a034ed508cfe 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyISelLowering.cpp
@@ -2341,7 +2341,7 @@ WebAssemblyTargetLowering::LowerEXTEND_VECTOR_INREG(SDValue Op,
 static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
   SDLoc DL(Op);
 
-  if (Op.getValueType() != MVT::v2f64)
+  if (Op.getValueType() != MVT::v2f64 && Op.getValueType() != MVT::v4f32)
     return SDValue();
 
   auto GetConvertedLane = [](SDValue Op, unsigned &Opcode, SDValue &SrcVec,
@@ -2354,6 +2354,7 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
       Opcode = WebAssemblyISD::CONVERT_LOW_U;
       break;
     case ISD::FP_EXTEND:
+    case ISD::FP16_TO_FP:
       Opcode = WebAssemblyISD::PROMOTE_LOW;
       break;
     default:
@@ -2372,36 +2373,52 @@ static SDValue LowerConvertLow(SDValue Op, SelectionDAG &DAG) {
     return true;
   };
 
-  unsigned LHSOpcode, RHSOpcode, LHSIndex, RHSIndex;
-  SDValue LHSSrcVec, RHSSrcVec;
-  if (!GetConvertedLane(Op.getOperand(0), LHSOpcode, LHSSrcVec, LHSIndex) ||
-      !GetConvertedLane(Op.getOperand(1), RHSOpcode, RHSSrcVec, RHSIndex))
+  unsigned NumLanes = Op.getValueType() == MVT::v2f64 ? 2 : 4;
+  unsigned FirstOpcode = 0, SecondOpcode = 0, ThirdOpcode = 0, FourthOpcode = 0;
+  unsigned FirstIndex = 0, SecondIndex = 0, ThirdIndex = 0, FourthIndex = 0;
+  SDValue FirstSrcVec, SecondSrcVec, ThirdSrcVec, FourthSrcVec;
+
+  if (!GetConvertedLane(Op.getOperand(0), FirstOpcode, FirstSrcVec, FirstIndex) ||
+      !GetConvertedLane(Op.getOperand(1), SecondOpcode, SecondSrcVec, SecondIndex))
+    return SDValue();
+
+  // If we're converting to v4f32, check the third and fourth lanes, too.
+  if (NumLanes == 4 && (!GetConvertedLane(Op.getOperand(2), ThirdOpcode, ThirdSrcVec, ThirdIndex) ||
+      !GetConvertedLane(Op.getOperand(3), FourthOpcode, FourthSrcVec, FourthIndex)))
+    return SDValue();
+
+  if (FirstOpcode != SecondOpcode)
     return SDValue();
 
-  if (LHSOpcode != RHSOpcode)
+  // TODO Add an optimization similar to the v2f64 below for shuffling the
+  // vectors when the lanes are in the wrong order or come from different src
+  // vectors.
+  if (NumLanes == 4 && (FirstOpcode != ThirdOpcode || FirstOpcode != FourthOpcode ||
+      FirstSrcVec != SecondSrcVec || FirstSrcVec != ThirdSrcVec || FirstSrcVec != FourthSrcVec ||
+      FirstIndex != 0 || SecondIndex != 1 || ThirdIndex != 2 || FourthIndex != 3))
     return SDValue();
 
   MVT ExpectedSrcVT;
-  switch (LHSOpcode) {
+  switch (FirstOpcode) {
   case WebAssemblyISD::CONVERT_LOW_S:
   case WebAssemblyISD::CONVERT_LOW_U:
     ExpectedSrcVT = MVT::v4i32;
     break;
   case WebAssemblyISD::PROMOTE_LOW:
-    ExpectedSrcVT = MVT::v4f32;
+    ExpectedSrcVT = NumLanes == 2 ? MVT::v4f32 : MVT::v8i16;
     break;
   }
-  if (LHSSrcVec.getValueType() != ExpectedSrcVT)
+  if (FirstSrcVec.getValueType() != ExpectedSrcVT)
     return SDValue();
 
-  auto Src = LHSSrcVec;
-  if (LHSIndex != 0 || RHSIndex != 1 || LHSSrcVec != RHSSrcVec) {
+  auto Src = FirstSrcVec;
+  if (NumLanes == 2 && (FirstIndex != 0 || SecondIndex != 1 || FirstSrcVec != SecondSrcVec)) {
     // Shuffle the source vector so that the converted lanes are the low lanes.
     Src = DAG.getVectorShuffle(
-        ExpectedSrcVT, DL, LHSSrcVec, RHSSrcVec,
-        {static_cast<int>(LHSIndex), static_cast<int>(RHSIndex) + 4, -1, -1});
+        ExpectedSrcVT, DL, FirstSrcVec, SecondSrcVec,
+        {static_cast<int>(FirstIndex), static_cast<int>(SecondIndex) + 4, -1, -1});
   }
-  return DAG.getNode(LHSOpcode, DL, MVT::v2f64, Src);
+  return DAG.getNode(FirstOpcode, DL, NumLanes == 2 ? MVT::v2f64 : MVT::v4f32, Src);
 }
 
 SDValue WebAssemblyTargetLowering::LowerBUILD_VECTOR(SDValue Op,

diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
index c591e5ef181a4..d2d62b8b62c3e 100644
--- a/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
+++ b/llvm/lib/Target/WebAssembly/WebAssemblyInstrSIMD.td
@@ -1468,6 +1468,7 @@ defm "" : SIMDConvert<F32x4, F64x2, demote_zero,
 def promote_t : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVec<1>]>;
 def promote_low : SDNode<"WebAssemblyISD::PROMOTE_LOW", promote_t>;
 defm "" : SIMDConvert<F64x2, F32x4, promote_low, "promote_low_f32x4", 0x5f>;
+defm "" : HalfPrecisionConvert<F32x4, I16x8, promote_low, "promote_low_f16x8", 0x14b>;
 
 // Lower extending loads to load64_zero + promote_low
 def extloadv2f32 : PatFrag<(ops node:$ptr), (extload node:$ptr)> {

diff --git a/llvm/test/CodeGen/WebAssembly/half-precision.ll b/llvm/test/CodeGen/WebAssembly/half-precision.ll
index 4e8ff5955c63b..f0e23ea289265 100644
--- a/llvm/test/CodeGen/WebAssembly/half-precision.ll
+++ b/llvm/test/CodeGen/WebAssembly/half-precision.ll
@@ -369,3 +369,23 @@ define <8 x half> @shuffle_poison_v8f16(<8 x half> %x, <8 x half> %y) {
                      i32 poison, i32 poison, i32 poison, i32 poison>
   ret <8 x half> %res
 }
+
+define <4 x float> @promote_low_v4f32(<8 x half> %x) {
+; CHECK-LABEL: promote_low_v4f32:
+; CHECK: .functype promote_low_v4f32 (v128) -> (v128){{$}}
+; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
+  %v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  %a = fpext <4 x half> %v to <4 x float>
+  ret <4 x float> %a
+}
+
+define <4 x float> @promote_low_v4f32_2(<8 x half> %x) {
+; CHECK-LABEL: promote_low_v4f32_2:
+; CHECK: .functype promote_low_v4f32_2 (v128) -> (v128)
+; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
+; CHECK-NEXT: return $pop[[R]]
+  %v = fpext <8 x half> %x to <8 x float>
+  %a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+  ret <4 x float> %a
+}

diff --git a/llvm/test/MC/WebAssembly/simd-encodings.s b/llvm/test/MC/WebAssembly/simd-encodings.s
index 48aec4bc52a0c..57af1daad0226 100644
--- a/llvm/test/MC/WebAssembly/simd-encodings.s
+++ b/llvm/test/MC/WebAssembly/simd-encodings.s
@@ -935,4 +935,7 @@ main:
     # CHECK: f16x8.convert_i16x8_u # encoding: [0xfd,0xc8,0x02]
     f16x8.convert_i16x8_u
 
+    # CHECK: f32x4.promote_low_f16x8 # encoding: [0xfd,0xcb,0x02]
+    f32x4.promote_low_f16x8
+
     end_function
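For reference, a minimal usage sketch of the new intrinsic (not part of the patch). The helper name promote_low_halves is illustrative, and the exact driver flags for enabling wasm SIMD and half-precision support are assumed here (e.g. -msimd128 plus the target's fp16 feature flag):

    #include <wasm_simd128.h>

    // Promote the low four f16 lanes of a v128 to f32.
    // With this patch, the header's __builtin_convertvector pattern should
    // select a single f32x4.promote_low_f16x8 instruction.
    v128_t promote_low_halves(v128_t v) {
      return wasm_f32x4_promote_low_f16x8(v);
    }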
@llvmbot
Member

llvmbot commented Mar 4, 2025

@llvm/pr-subscribers-backend-webassembly
@llvmbot
Member

llvmbot commented Mar 4, 2025

@llvm/pr-subscribers-mc
@github-actions

github-actions bot commented Mar 4, 2025

✅ With the latest revision this PR passed the C/C++ code formatter.

@github-actions

github-actions bot commented Mar 4, 2025

✅ With the latest revision this PR passed the undef deprecator.

; CHECK: .functype promote_low_v4f32 (v128) -> (v128){{$}}
; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
; CHECK-NEXT: return $pop[[R]]
%v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
Suggested change
%v = shufflevector <8 x half> %x, <8 x half> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%v = shufflevector <8 x half> %x, <8 x half> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
; CHECK-NEXT: f32x4.promote_low_f16x8 $push[[R:[0-9]+]]=, $0
; CHECK-NEXT: return $pop[[R]]
%v = fpext <8 x half> %x to <8 x float>
%a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
Suggested change
%a = shufflevector <8 x float> %v, <8 x float> undef, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
%a = shufflevector <8 x float> %v, <8 x float> poison, <4 x i32> <i32 0, i32 1, i32 2, i32 3>

Labels

backend:WebAssembly
backend:X86
clang:headers (Headers provided by Clang, e.g. for intrinsics)
clang (Clang issues not falling into any other category)
llvm:mc (Machine (object) code)

3 participants