llvm
diff --git a/clang/include/clang/Basic/BuiltinsLoongArchLASX.def b/clang/include/clang/Basic/BuiltinsLoongArchLASX.def
Lines changed: 19 additions & 0 deletions
diff --git a/clang/lib/Headers/lasxintrin.h b/clang/lib/Headers/lasxintrin.h
Lines changed: 110 additions & 0 deletions
diff --git a/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c b/clang/test/CodeGen/LoongArch/lasx/builtin-alias.c
Lines changed: 153 additions & 0 deletions
@@ -986,3 +986,22 @@ TARGET_BUILTIN(__builtin_lasx_xbnz_b, "iV32Uc", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_h, "iV16Us", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_w, "iV8Ui", "nc", "lasx")
 TARGET_BUILTIN(__builtin_lasx_xbnz_d, "iV4ULLi", "nc", "lasx")
+
+TARGET_BUILTIN(__builtin_lasx_cast_128_s, "V8fV4f", "nc", "lasx") // Type string lists the result first: 8 x float (4 x float); "nc" = nothrow + const; needs the "lasx" feature.
+TARGET_BUILTIN(__builtin_lasx_cast_128_d, "V4dV2d", "nc", "lasx") // 4 x double (2 x double)
+TARGET_BUILTIN(__builtin_lasx_cast_128, "V32ScV16Sc", "nc", "lasx") // 32 x signed char (16 x signed char)
+TARGET_BUILTIN(__builtin_lasx_concat_128_s, "V8fV4fV4f", "nc", "lasx") // 8 x float (4 x float, 4 x float)
+TARGET_BUILTIN(__builtin_lasx_concat_128_d, "V4dV2dV2d", "nc", "lasx") // 4 x double (2 x double, 2 x double)
+TARGET_BUILTIN(__builtin_lasx_concat_128, "V32ScV16ScV16Sc", "nc", "lasx") // 32 x signed char (16 x signed char, 16 x signed char)
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo_s, "V4fV8f", "nc", "lasx") // 4 x float (8 x float)
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo_d, "V2dV4d", "nc", "lasx") // 2 x double (4 x double)
+TARGET_BUILTIN(__builtin_lasx_extract_128_lo, "V16ScV32Sc", "nc", "lasx") // 16 x signed char (32 x signed char)
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi_s, "V4fV8f", "nc", "lasx") // 4 x float (8 x float)
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi_d, "V2dV4d", "nc", "lasx") // 2 x double (4 x double)
+TARGET_BUILTIN(__builtin_lasx_extract_128_hi, "V16ScV32Sc", "nc", "lasx") // 16 x signed char (32 x signed char)
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo_s, "V8fV8fV4f", "nc", "lasx") // 8 x float (8 x float, 4 x float)
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo_d, "V4dV4dV2d", "nc", "lasx") // 4 x double (4 x double, 2 x double)
+TARGET_BUILTIN(__builtin_lasx_insert_128_lo, "V32ScV32ScV16Sc", "nc", "lasx") // 32 x signed char (32 x signed char, 16 x signed char)
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi_s, "V8fV8fV4f", "nc", "lasx") // 8 x float (8 x float, 4 x float)
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi_d, "V4dV4dV2d", "nc", "lasx") // 4 x double (4 x double, 2 x double)
+TARGET_BUILTIN(__builtin_lasx_insert_128_hi, "V32ScV32ScV16Sc", "nc", "lasx") // 32 x signed char (32 x signed char, 16 x signed char)
@@ -10,6 +10,8 @@
 #ifndef _LOONGSON_ASXINTRIN_H
 #define _LOONGSON_ASXINTRIN_H 1
 
+#include <lsxintrin.h>
+
 #if defined(__loongarch_asx)
 
 typedef signed char v32i8 __attribute__((vector_size(32), aligned(32)));
@@ -3882,5 +3884,113 @@ extern __inline
 
 #define __lasx_xvrepli_w(/*si10*/ _1) ((__m256i)__builtin_lasx_xvrepli_w((_1)))
 
+extern __inline /* Wrapper: takes a __m128, returns a __m256 via __builtin_lasx_cast_128_s. */
+ __attribute__((__gnu_inline__, __always_inline__,
+ __artificial__)) __m256 __lasx_cast_128_s(__m128 _1) {
+ return (__m256)__builtin_lasx_cast_128_s((v4f32)_1); /* builtin is typed on v4f32, hence the casts */
+} /* NOTE(review): how the extra 128 bits of the result are filled is not visible here - confirm. */
+
+extern __inline /* Wrapper: takes a __m128d, returns a __m256d via __builtin_lasx_cast_128_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+ __lasx_cast_128_d(__m128d _1) {
+ return (__m256d)__builtin_lasx_cast_128_d((v2f64)_1); /* builtin is typed on v2f64, hence the casts */
+} /* NOTE(review): how the extra 128 bits of the result are filled is not visible here - confirm. */
+
+extern __inline /* Wrapper: takes a __m128i, returns a __m256i via __builtin_lasx_cast_128. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+ __lasx_cast_128(__m128i _1) {
+ return (__m256i)__builtin_lasx_cast_128((v16i8)_1); /* integer form is passed to the builtin as a byte vector (v16i8) */
+} /* NOTE(review): how the extra 128 bits of the result are filled is not visible here - confirm. */
+
+extern __inline /* Wrapper: builds one __m256 from two __m128 values via __builtin_lasx_concat_128_s. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+ __lasx_concat_128_s(__m128 _1, __m128 _2) {
+ return (__m256)__builtin_lasx_concat_128_s((v4f32)_1, (v4f32)_2); /* both operands passed as v4f32 */
+} /* NOTE(review): which operand lands in the low vs. high half is not visible here - confirm. */
+
+extern __inline /* Wrapper: builds one __m256d from two __m128d values via __builtin_lasx_concat_128_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+ __lasx_concat_128_d(__m128d _1, __m128d _2) {
+ return (__m256d)__builtin_lasx_concat_128_d((v2f64)_1, (v2f64)_2); /* both operands passed as v2f64 */
+} /* NOTE(review): which operand lands in the low vs. high half is not visible here - confirm. */
+
+extern __inline /* Wrapper: builds one __m256i from two __m128i values via __builtin_lasx_concat_128. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+ __lasx_concat_128(__m128i _1, __m128i _2) {
+ return (__m256i)__builtin_lasx_concat_128((v16i8)_1, (v16i8)_2); /* integer form is passed as byte vectors (v16i8) */
+} /* NOTE(review): which operand lands in the low vs. high half is not visible here - confirm. */
+
+extern __inline /* Wrapper: takes a __m256, returns a __m128 via __builtin_lasx_extract_128_lo_s. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+ __lasx_extract_128_lo_s(__m256 _1) {
+ return (__m128)__builtin_lasx_extract_128_lo_s((v8f32)_1); /* source passed as v8f32 */
+} /* NOTE(review): presumably yields the low 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256d, returns a __m128d via __builtin_lasx_extract_128_lo_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+ __lasx_extract_128_lo_d(__m256d _1) {
+ return (__m128d)__builtin_lasx_extract_128_lo_d((v4f64)_1); /* source passed as v4f64 */
+} /* NOTE(review): presumably yields the low 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256i, returns a __m128i via __builtin_lasx_extract_128_lo. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
+ __lasx_extract_128_lo(__m256i _1) {
+ return (__m128i)__builtin_lasx_extract_128_lo((v32i8)_1); /* integer form is passed as a byte vector (v32i8) */
+} /* NOTE(review): presumably yields the low 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256, returns a __m128 via __builtin_lasx_extract_128_hi_s. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128
+ __lasx_extract_128_hi_s(__m256 _1) {
+ return (__m128)__builtin_lasx_extract_128_hi_s((v8f32)_1); /* source passed as v8f32 */
+} /* NOTE(review): presumably yields the high 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256d, returns a __m128d via __builtin_lasx_extract_128_hi_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128d
+ __lasx_extract_128_hi_d(__m256d _1) {
+ return (__m128d)__builtin_lasx_extract_128_hi_d((v4f64)_1); /* source passed as v4f64 */
+} /* NOTE(review): presumably yields the high 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256i, returns a __m128i via __builtin_lasx_extract_128_hi. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m128i
+ __lasx_extract_128_hi(__m256i _1) {
+ return (__m128i)__builtin_lasx_extract_128_hi((v32i8)_1); /* integer form is passed as a byte vector (v32i8) */
+} /* NOTE(review): presumably yields the high 128-bit half, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256 and a __m128, returns a __m256 via __builtin_lasx_insert_128_lo_s. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+ __lasx_insert_128_lo_s(__m256 _1, __m128 _2) {
+ return (__m256)__builtin_lasx_insert_128_lo_s((v8f32)_1, (v4f32)_2); /* operands passed as v8f32 and v4f32 */
+} /* NOTE(review): presumably _2 replaces the low 128-bit half of _1, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256d and a __m128d, returns a __m256d via __builtin_lasx_insert_128_lo_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+ __lasx_insert_128_lo_d(__m256d _1, __m128d _2) {
+ return (__m256d)__builtin_lasx_insert_128_lo_d((v4f64)_1, (v2f64)_2); /* operands passed as v4f64 and v2f64 */
+} /* NOTE(review): presumably _2 replaces the low 128-bit half of _1, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256i and a __m128i, returns a __m256i via __builtin_lasx_insert_128_lo. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+ __lasx_insert_128_lo(__m256i _1, __m128i _2) {
+ return (__m256i)__builtin_lasx_insert_128_lo((v32i8)_1, (v16i8)_2); /* integer form is passed as byte vectors (v32i8, v16i8) */
+} /* NOTE(review): presumably _2 replaces the low 128-bit half of _1, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256 and a __m128, returns a __m256 via __builtin_lasx_insert_128_hi_s. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256
+ __lasx_insert_128_hi_s(__m256 _1, __m128 _2) {
+ return (__m256)__builtin_lasx_insert_128_hi_s((v8f32)_1, (v4f32)_2); /* operands passed as v8f32 and v4f32 */
+} /* NOTE(review): presumably _2 replaces the high 128-bit half of _1, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256d and a __m128d, returns a __m256d via __builtin_lasx_insert_128_hi_d. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256d
+ __lasx_insert_128_hi_d(__m256d _1, __m128d _2) {
+ return (__m256d)__builtin_lasx_insert_128_hi_d((v4f64)_1, (v2f64)_2); /* operands passed as v4f64 and v2f64 */
+} /* NOTE(review): presumably _2 replaces the high 128-bit half of _1, going by the name - confirm. */
+
+extern __inline /* Wrapper: takes a __m256i and a __m128i, returns a __m256i via __builtin_lasx_insert_128_hi. */
+ __attribute__((__gnu_inline__, __always_inline__, __artificial__)) __m256i
+ __lasx_insert_128_hi(__m256i _1, __m128i _2) {
+ return (__m256i)__builtin_lasx_insert_128_hi((v32i8)_1, (v16i8)_2); /* integer form is passed as byte vectors (v32i8, v16i8) */
+} /* NOTE(review): presumably _2 replaces the high 128-bit half of _1, going by the name - confirm. */
+
 #endif /* defined(__loongarch_asx). */
 #endif /* _LOONGSON_ASXINTRIN_H. */
@@ -6384,3 +6384,156 @@ v16i16 xvrepli_h() { return __lasx_xvrepli_h(1); }
 // CHECK-NEXT: ret void
 //
 v8i32 xvrepli_w() { return __lasx_xvrepli_w(1); }
+// CHECK-LABEL: @cast_128_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.cast.128.s(<4 x float> [[TMP0]])
+// CHECK-NEXT: store <8 x float> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_cast_128_s; expected IR above: i128 argument bitcast to 4 x float, one cast.128.s intrinsic call, 8 x float result stored through the result pointer.
+v8f32 cast_128_s(v4f32 _1) { return __lasx_cast_128_s(_1); }
+// CHECK-LABEL: @cast_128_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.cast.128.d(<2 x double> [[TMP0]])
+// CHECK-NEXT: store <4 x double> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_cast_128_d; expected IR above: i128 argument bitcast to 2 x double, one cast.128.d intrinsic call, 4 x double result stored through the result pointer.
+v4f64 cast_128_d(v2f64 _1) { return __lasx_cast_128_d(_1); }
+// CHECK-LABEL: @cast_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.cast.128(<16 x i8> [[TMP0]])
+// CHECK-NEXT: store <32 x i8> [[TMP1]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_cast_128; expected IR above: i128 argument bitcast to 16 x i8, one cast.128 intrinsic call, 32 x i8 result stored through the result pointer.
+v32i8 cast_128(v16i8 _1) { return __lasx_cast_128(_1); }
+// CHECK-LABEL: @concat_128_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.concat.128.s(<4 x float> [[TMP0]], <4 x float> [[TMP1]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_concat_128_s; expected IR above: both i128 arguments bitcast to 4 x float, one concat.128.s intrinsic call in argument order, 8 x float result stored through the result pointer.
+v8f32 concat_128_s(v4f32 _1, v4f32 _2) { return __lasx_concat_128_s(_1, _2); }
+// CHECK-LABEL: @concat_128_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.concat.128.d(<2 x double> [[TMP0]], <2 x double> [[TMP1]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_concat_128_d; expected IR above: both i128 arguments bitcast to 2 x double, one concat.128.d intrinsic call in argument order, 4 x double result stored through the result pointer.
+v4f64 concat_128_d(v2f64 _1, v2f64 _2) { return __lasx_concat_128_d(_1, _2); }
+// CHECK-LABEL: @concat_128(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[TMP0:%.*]] = bitcast i128 [[_1_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.concat.128(<16 x i8> [[TMP0]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_concat_128; expected IR above: both i128 arguments bitcast to 16 x i8, one concat.128 intrinsic call in argument order, 32 x i8 result stored through the result pointer.
+v32i8 concat_128(v16i8 _1, v16i8 _2) { return __lasx_concat_128(_1, _2); }
+// CHECK-LABEL: @extract_128_lo_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.lo.s(<8 x float> [[_1]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_lo_s; expected IR above: 8 x float argument loaded from a pointer, one extract.128.lo.s intrinsic call, 4 x float result returned as i128.
+v4f32 extract_128_lo_s(v8f32 _1) { return __lasx_extract_128_lo_s(_1); }
+// CHECK-LABEL: @extract_128_lo_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.lo.d(<4 x double> [[_1]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_lo_d; expected IR above: 4 x double argument loaded from a pointer, one extract.128.lo.d intrinsic call, 2 x double result returned as i128.
+v2f64 extract_128_lo_d(v4f64 _1) { return __lasx_extract_128_lo_d(_1); }
+// CHECK-LABEL: @extract_128_lo(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lasx.extract.128.lo(<32 x i8> [[_112]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_lo; expected IR above: 32 x i8 argument loaded from a pointer, one extract.128.lo intrinsic call, 16 x i8 result returned as i128.
+v16i8 extract_128_lo(v32i8 _1) { return __lasx_extract_128_lo(_1); }
+// CHECK-LABEL: @extract_128_hi_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <4 x float> @llvm.loongarch.lasx.extract.128.hi.s(<8 x float> [[_1]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <4 x float> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_hi_s; expected IR above: 8 x float argument loaded from a pointer, one extract.128.hi.s intrinsic call, 4 x float result returned as i128.
+v4f32 extract_128_hi_s(v8f32 _1) { return __lasx_extract_128_hi_s(_1); }
+// CHECK-LABEL: @extract_128_hi_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <2 x double> @llvm.loongarch.lasx.extract.128.hi.d(<4 x double> [[_1]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <2 x double> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_hi_d; expected IR above: 4 x double argument loaded from a pointer, one extract.128.hi.d intrinsic call, 2 x double result returned as i128.
+v2f64 extract_128_hi_d(v4f64 _1) { return __lasx_extract_128_hi_d(_1); }
+// CHECK-LABEL: @extract_128_hi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_112:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = tail call <16 x i8> @llvm.loongarch.lasx.extract.128.hi(<32 x i8> [[_112]])
+// CHECK-NEXT: [[TMP2:%.*]] = bitcast <16 x i8> [[TMP1]] to i128
+// CHECK-NEXT: ret i128 [[TMP2]]
+// Test wrapper for __lasx_extract_128_hi; expected IR above: 32 x i8 argument loaded from a pointer, one extract.128.hi intrinsic call, 16 x i8 result returned as i128.
+v16i8 extract_128_hi(v32i8 _1) { return __lasx_extract_128_hi(_1); }
+// CHECK-LABEL: @insert_128_lo_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.lo.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_lo_s; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 4 x float, one insert.128.lo.s intrinsic call, 8 x float result stored through the result pointer.
+v8f32 insert_128_lo_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_lo_s(_1, _2); }
+// CHECK-LABEL: @insert_128_lo_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.lo.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_lo_d; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 2 x double, one insert.128.lo.d intrinsic call, 4 x double result stored through the result pointer.
+v4f64 insert_128_lo_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_lo_d(_1, _2); }
+// CHECK-LABEL: @insert_128_lo(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_123:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.insert.128.lo(<32 x i8> [[_123]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_lo; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 16 x i8, one insert.128.lo intrinsic call, 32 x i8 result stored through the result pointer.
+v32i8 insert_128_lo(v32i8 _1, v16i8 _2) { return __lasx_insert_128_lo(_1, _2); }
+// CHECK-LABEL: @insert_128_hi_s(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <8 x float>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <4 x float>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <8 x float> @llvm.loongarch.lasx.insert.128.hi.s(<8 x float> [[_1]], <4 x float> [[TMP1]])
+// CHECK-NEXT: store <8 x float> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_hi_s; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 4 x float, one insert.128.hi.s intrinsic call, 8 x float result stored through the result pointer.
+v8f32 insert_128_hi_s(v8f32 _1, v4f32 _2) { return __lasx_insert_128_hi_s(_1, _2); }
+// CHECK-LABEL: @insert_128_hi_d(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_1:%.*]] = load <4 x double>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <2 x double>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <4 x double> @llvm.loongarch.lasx.insert.128.hi.d(<4 x double> [[_1]], <2 x double> [[TMP1]])
+// CHECK-NEXT: store <4 x double> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_hi_d; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 2 x double, one insert.128.hi.d intrinsic call, 4 x double result stored through the result pointer.
+v4f64 insert_128_hi_d(v4f64 _1, v2f64 _2) { return __lasx_insert_128_hi_d(_1, _2); }
+// CHECK-LABEL: @insert_128_hi(
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[_123:%.*]] = load <32 x i8>, ptr [[TMP0:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast i128 [[_2_COERCE:%.*]] to <16 x i8>
+// CHECK-NEXT: [[TMP2:%.*]] = tail call <32 x i8> @llvm.loongarch.lasx.insert.128.hi(<32 x i8> [[_123]], <16 x i8> [[TMP1]])
+// CHECK-NEXT: store <32 x i8> [[TMP2]], ptr [[AGG_RESULT:%.*]], align 32, !tbaa [[TBAA2]]
+// CHECK-NEXT: ret void
+// Test wrapper for __lasx_insert_128_hi; expected IR above: _1 loaded from a pointer, i128 _2 bitcast to 16 x i8, one insert.128.hi intrinsic call, 32 x i8 result stored through the result pointer.
+v32i8 insert_128_hi(v32i8 _1, v16i8 _2) { return __lasx_insert_128_hi(_1, _2); }