llvm · FreddyLeaf · Apr 29, 2024
diff --git a/llvm/docs/LangRef.rst b/llvm/docs/LangRef.rst
@@ -2408,6 +2408,17 @@ example:
  attempt is made to diagnose unsupported uses. Currently this
  attribute is respected by the AMDGPU and NVPTX backends.
 
+``"denormal-fp-math-bf16"``
+ Same as ``"denormal-fp-math"``, but only controls the behavior of
+ the Brain Float16 type (or vectors of Brain Float16). If both are
+ present, this overrides ``"denormal-fp-math"``. Not all targets
+ support separately setting the denormal mode per type, and no
+ attempt is made to diagnose unsupported uses. Currently this
+ attribute is respected by the X86 backend.
+
+ If this attribute is not specified, the default is
+ ``"preserve-sign,preserve-sign"``.
+
 ``"thunk"``
  This attribute indicates that the function will delegate to some other
  function with a tail call. The prototype of a thunk should not be used for

diff --git a/llvm/include/llvm/CodeGen/CommandFlags.h b/llvm/include/llvm/CodeGen/CommandFlags.h
@@ -71,6 +71,7 @@ bool getEnableNoTrappingFPMath();
 
 DenormalMode::DenormalModeKind getDenormalFPMath();
 DenormalMode::DenormalModeKind getDenormalFP32Math();
+DenormalMode::DenormalModeKind getDenormalBF16Math();
 
 bool getEnableHonorSignDependentRoundingFPMath();
 

diff --git a/llvm/lib/CodeGen/CommandFlags.cpp b/llvm/lib/CodeGen/CommandFlags.cpp
@@ -73,6 +73,7 @@ CGOPT(bool, EnableNoTrappingFPMath)
 CGOPT(bool, EnableAIXExtendedAltivecABI)
 CGOPT(DenormalMode::DenormalModeKind, DenormalFPMath)
 CGOPT(DenormalMode::DenormalModeKind, DenormalFP32Math)
+CGOPT(DenormalMode::DenormalModeKind, DenormalBF16Math)
 CGOPT(bool, EnableHonorSignDependentRoundingFPMath)
 CGOPT(FloatABI::ABIType, FloatABIForCalls)
 CGOPT(FPOpFusion::FPOpFusionMode, FuseFPOps)
@@ -277,6 +278,13 @@ codegen::RegisterCodeGenFlags::RegisterCodeGenFlags() {
  DenormFlagEnumOptions);
  CGBINDOPT(DenormalFP32Math);
 
+ static cl::opt<DenormalMode::DenormalModeKind> DenormalBF16Math(
+ "denormal-fp-math-bf16",
+ cl::desc("Select which denormal numbers the code is permitted to require "
+ "for bfloat"),
+ cl::init(DenormalMode::PreserveSign), DenormFlagEnumOptions);
+ CGBINDOPT(DenormalBF16Math);
+
  static cl::opt<bool> EnableHonorSignDependentRoundingFPMath(
  "enable-sign-dependent-rounding-fp-math", cl::Hidden,
  cl::desc("Force codegen to assume rounding mode can change dynamically"),
@@ -719,6 +727,14 @@ void codegen::setFunctionAttributes(StringRef CPU, StringRef Features,
  DenormalMode(DenormKind, DenormKind).str());
  }
 
+ if (DenormalBF16MathView->getNumOccurrences() > 0 &&
+ !F.hasFnAttribute("denormal-fp-math-bf16")) {
+ // FIXME: Command line flag should expose separate input/output modes.
+ DenormalMode::DenormalModeKind DenormKind = getDenormalBF16Math();
+ NewAttrs.addAttribute("denormal-fp-math-bf16",
+ DenormalMode(DenormKind, DenormKind).str());
+ }
+
  if (TrapFuncNameView->getNumOccurrences() > 0)
  for (auto &B : F)
  for (auto &I : B)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2283,7 +2283,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM,
  }
  }
 
- if (!Subtarget.useSoftFloat() &&
+ if (!Subtarget.useSoftFloat() && Subtarget.getDenormalMathFTZDAZBF16() &&
  (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16())) {
  addRegisterClass(MVT::v8bf16, Subtarget.hasAVX512() ? &X86::VR128XRegClass
  : &X86::VR128RegClass);
@@ -8740,6 +8740,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const {
  return LowerBUILD_VECTORvXi1(Op, dl, DAG, Subtarget);
 
  if (VT.getVectorElementType() == MVT::bf16 &&
+ Subtarget.getDenormalMathFTZDAZBF16() &&
  (Subtarget.hasAVXNECONVERT() || Subtarget.hasBF16()))
  return LowerBUILD_VECTORvXbf16(Op, DAG, Subtarget);
 
@@ -21536,6 +21537,7 @@ SDValue X86TargetLowering::LowerFP_ROUND(SDValue Op, SelectionDAG &DAG) const {
 
  if (VT.getScalarType() == MVT::bf16) {
  if (SVT.getScalarType() == MVT::f32 &&
+ Subtarget.getDenormalMathFTZDAZBF16() &&
  ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
  Subtarget.hasAVXNECONVERT()))
  return Op;
@@ -21644,8 +21646,9 @@ SDValue X86TargetLowering::LowerFP_TO_BF16(SDValue Op,
  SDLoc DL(Op);
 
  MVT SVT = Op.getOperand(0).getSimpleValueType();
- if (SVT == MVT::f32 && ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
- Subtarget.hasAVXNECONVERT())) {
+ if (SVT == MVT::f32 && Subtarget.getDenormalMathFTZDAZBF16() &&
+ ((Subtarget.hasBF16() && Subtarget.hasVLX()) ||
+ Subtarget.hasAVXNECONVERT())) {
  SDValue Res;
  Res = DAG.getNode(ISD::SCALAR_TO_VECTOR, DL, MVT::v4f32, Op.getOperand(0));
  Res = DAG.getNode(X86ISD::CVTNEPS2BF16, DL, MVT::v8bf16, Res);

diff --git a/llvm/lib/Target/X86/X86Subtarget.cpp b/llvm/lib/Target/X86/X86Subtarget.cpp
@@ -324,12 +324,14 @@ X86Subtarget::X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU,
  StringRef FS, const X86TargetMachine &TM,
  MaybeAlign StackAlignOverride,
  unsigned PreferVectorWidthOverride,
- unsigned RequiredVectorWidth)
+ unsigned RequiredVectorWidth,
+ bool DenormalMathFTZDAZBF16)
  : X86GenSubtargetInfo(TT, CPU, TuneCPU, FS),
  PICStyle(PICStyles::Style::None), TM(TM), TargetTriple(TT),
  StackAlignOverride(StackAlignOverride),
  PreferVectorWidthOverride(PreferVectorWidthOverride),
  RequiredVectorWidth(RequiredVectorWidth),
+ DenormalMathFTZDAZBF16(DenormalMathFTZDAZBF16),
  InstrInfo(initializeSubtargetDependencies(CPU, TuneCPU, FS)),
  TLInfo(TM, *this), FrameLowering(*this, getStackAlignment()) {
  // Determine the PICStyle based on the target selected.

diff --git a/llvm/lib/Target/X86/X86Subtarget.h b/llvm/lib/Target/X86/X86Subtarget.h
@@ -106,6 +106,9 @@ class X86Subtarget final : public X86GenSubtargetInfo {
  /// Required vector width from function attribute.
  unsigned RequiredVectorWidth;
 
+ /// True if "denormal-fp-math-bf16" is preserve-sign (FTZ/DAZ) or unset.
+ bool DenormalMathFTZDAZBF16 = false;
+
  X86SelectionDAGInfo TSInfo;
  // Ordering here is important. X86InstrInfo initializes X86RegisterInfo which
  // X86TargetLowering needs.
@@ -119,8 +122,8 @@ class X86Subtarget final : public X86GenSubtargetInfo {
  ///
  X86Subtarget(const Triple &TT, StringRef CPU, StringRef TuneCPU, StringRef FS,
  const X86TargetMachine &TM, MaybeAlign StackAlignOverride,
- unsigned PreferVectorWidthOverride,
- unsigned RequiredVectorWidth);
+ unsigned PreferVectorWidthOverride, unsigned RequiredVectorWidth,
+ bool DenormalMathFTZDAZBF16);
 
  const X86TargetLowering *getTargetLowering() const override {
  return &TLInfo;
@@ -238,6 +241,7 @@ class X86Subtarget final : public X86GenSubtargetInfo {
 
  unsigned getPreferVectorWidth() const { return PreferVectorWidth; }
  unsigned getRequiredVectorWidth() const { return RequiredVectorWidth; }
+ bool getDenormalMathFTZDAZBF16() const { return DenormalMathFTZDAZBF16; }
 
  // Helper functions to determine when we should allow widening to 512-bit
  // during codegen.

diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp
@@ -304,6 +304,15 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
  }
  }
 
+ // Extract denormal-fp-math-bf16 attribute.
+ bool DenormalMathFTZDAZBF16 = true;
+ Attribute DenormalBF16MathAttr = F.getFnAttribute("denormal-fp-math-bf16");
+ if (DenormalBF16MathAttr.isValid()) {
+ StringRef Val = DenormalBF16MathAttr.getValueAsString();
+ if (Val != "" && Val != "preserve-sign,preserve-sign")
+ DenormalMathFTZDAZBF16 = false;
+ }
+
  // Add CPU to the Key.
  Key += CPU;
 
@@ -339,7 +348,7 @@ X86TargetMachine::getSubtargetImpl(const Function &F) const {
  I = std::make_unique<X86Subtarget>(
  TargetTriple, CPU, TuneCPU, FS, *this,
  MaybeAlign(F.getParent()->getOverrideStackAlignment()),
- PreferVectorWidthOverride, RequiredVectorWidth);
+ PreferVectorWidthOverride, RequiredVectorWidth, DenormalMathFTZDAZBF16);
  }
  return I.get();
 }

diff --git a/llvm/test/CodeGen/X86/bfloat-ftz-daz.ll b/llvm/test/CodeGen/X86/bfloat-ftz-daz.ll
@@ -0,0 +1,78 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=avxneconvert | FileCheck %s --check-prefixes=FTZDAZ
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -denormal-fp-math-bf16=ieee -mattr=avxneconvert | FileCheck %s --check-prefixes=NOFTZDAZ
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -denormal-fp-math-bf16=preserve-sign -mattr=avxneconvert | FileCheck %s --check-prefixes=FTZDAZ
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -denormal-fp-math=ieee -mattr=avxneconvert | FileCheck %s --check-prefixes=FTZDAZ
+; RUN: llc < %s -mtriple=x86_64-linux-gnu -denormal-fp-math=ieee -denormal-fp-math-bf16=ieee -mattr=avxneconvert | FileCheck %s --check-prefixes=NOFTZDAZ
+
+define void @add_default_attr(ptr %pa, ptr %pb, ptr %pc) nounwind {
+; FTZDAZ-LABEL: add_default_attr:
+; FTZDAZ: # %bb.0:
+; FTZDAZ-NEXT: movzwl (%rsi), %eax
+; FTZDAZ-NEXT: shll $16, %eax
+; FTZDAZ-NEXT: vmovd %eax, %xmm0
+; FTZDAZ-NEXT: movzwl (%rdi), %eax
+; FTZDAZ-NEXT: shll $16, %eax
+; FTZDAZ-NEXT: vmovd %eax, %xmm1
+; FTZDAZ-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FTZDAZ-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; FTZDAZ-NEXT: vpextrw $0, %xmm0, (%rdx)
+; FTZDAZ-NEXT: retq
+;
+; NOFTZDAZ-LABEL: add_default_attr:
+; NOFTZDAZ: # %bb.0:
+; NOFTZDAZ-NEXT: pushq %rbx
+; NOFTZDAZ-NEXT: movq %rdx, %rbx
+; NOFTZDAZ-NEXT: movzwl (%rsi), %eax
+; NOFTZDAZ-NEXT: shll $16, %eax
+; NOFTZDAZ-NEXT: vmovd %eax, %xmm0
+; NOFTZDAZ-NEXT: movzwl (%rdi), %eax
+; NOFTZDAZ-NEXT: shll $16, %eax
+; NOFTZDAZ-NEXT: vmovd %eax, %xmm1
+; NOFTZDAZ-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; NOFTZDAZ-NEXT: callq __truncsfbf2@PLT
+; NOFTZDAZ-NEXT: vpextrw $0, %xmm0, (%rbx)
+; NOFTZDAZ-NEXT: popq %rbx
+; NOFTZDAZ-NEXT: retq
+ %a = load bfloat, ptr %pa
+ %b = load bfloat, ptr %pb
+ %add = fadd bfloat %a, %b
+ store bfloat %add, ptr %pc
+ ret void
+}
+
+define void @add_no_ftz_daz_attr(ptr %pa, ptr %pb, ptr %pc) nounwind "denormal-fp-math-bf16"="ieee,ieee" {
+; FTZDAZ-LABEL: add_no_ftz_daz_attr:
+; FTZDAZ: # %bb.0:
+; FTZDAZ-NEXT: movzwl (%rsi), %eax
+; FTZDAZ-NEXT: shll $16, %eax
+; FTZDAZ-NEXT: vmovd %eax, %xmm0
+; FTZDAZ-NEXT: movzwl (%rdi), %eax
+; FTZDAZ-NEXT: shll $16, %eax
+; FTZDAZ-NEXT: vmovd %eax, %xmm1
+; FTZDAZ-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; FTZDAZ-NEXT: {vex} vcvtneps2bf16 %xmm0, %xmm0
+; FTZDAZ-NEXT: vpextrw $0, %xmm0, (%rdx)
+; FTZDAZ-NEXT: retq
+;
+; NOFTZDAZ-LABEL: add_no_ftz_daz_attr:
+; NOFTZDAZ: # %bb.0:
+; NOFTZDAZ-NEXT: pushq %rbx
+; NOFTZDAZ-NEXT: movq %rdx, %rbx
+; NOFTZDAZ-NEXT: movzwl (%rsi), %eax
+; NOFTZDAZ-NEXT: shll $16, %eax
+; NOFTZDAZ-NEXT: vmovd %eax, %xmm0
+; NOFTZDAZ-NEXT: movzwl (%rdi), %eax
+; NOFTZDAZ-NEXT: shll $16, %eax
+; NOFTZDAZ-NEXT: vmovd %eax, %xmm1
+; NOFTZDAZ-NEXT: vaddss %xmm0, %xmm1, %xmm0
+; NOFTZDAZ-NEXT: callq __truncsfbf2@PLT
+; NOFTZDAZ-NEXT: vpextrw $0, %xmm0, (%rbx)
+; NOFTZDAZ-NEXT: popq %rbx
+; NOFTZDAZ-NEXT: retq
+ %a = load bfloat, ptr %pa
+ %b = load bfloat, ptr %pb
+ %add = fadd bfloat %a, %b
+ store bfloat %add, ptr %pc
+ ret void
+}
diff --git a/llvm/test/Other/opt-override-denormal-fp-math-bf16.ll b/llvm/test/Other/opt-override-denormal-fp-math-bf16.ll
@@ -0,0 +1,23 @@
+; RUN: opt -S -denormal-fp-math-bf16=ieee %s | FileCheck -check-prefixes=IEEE,ALL %s
+; RUN: opt -S -denormal-fp-math-bf16=preserve-sign %s | FileCheck -check-prefixes=PRESERVESIGN,ALL %s
+; RUN: opt -S -denormal-fp-math-bf16=positive-zero %s | FileCheck -check-prefixes=POSITIVEZERO,ALL %s
+
+; ALL: @no_denormal_fp_math_f32_attr() [[NOATTR:#[0-9]+]] {
+define i32 @no_denormal_fp_math_f32_attr() #0 {
+entry:
+ ret i32 0
+}
+
+; ALL: denormal_fp_math_attr_preserve_sign_ieee() [[ATTR:#[0-9]+]] {
+define i32 @denormal_fp_math_attr_preserve_sign_ieee() #1 {
+entry:
+ ret i32 0
+}
+
+; ALL-DAG: attributes [[ATTR]] = { nounwind "denormal-fp-math-bf16"="preserve-sign,ieee" }
+; IEEE-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="ieee,ieee" }
+; PRESERVESIGN-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="preserve-sign,preserve-sign" }
+; POSITIVEZERO-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="positive-zero,positive-zero" }
+
+attributes #0 = { nounwind }
+attributes #1 = { nounwind "denormal-fp-math-bf16"="preserve-sign,ieee" }
diff --git a/llvm/test/Other/opt-override-denormal-fp-math-mixed.ll b/llvm/test/Other/opt-override-denormal-fp-math-mixed.ll
@@ -6,11 +6,17 @@
 ; RUN: opt -S -denormal-fp-math-f32=preserve-sign %s | FileCheck -check-prefixes=PRESERVESIGNF32,ALL %s
 ; RUN: opt -S -denormal-fp-math-f32=positive-zero %s | FileCheck -check-prefixes=POSITIVEZEROF32,ALL %s
 
+; RUN: opt -S -denormal-fp-math-bf16=ieee %s | FileCheck -check-prefixes=IEEEBF16,ALL %s
+; RUN: opt -S -denormal-fp-math-bf16=preserve-sign %s | FileCheck -check-prefixes=PRESERVESIGNBF16,ALL %s
+; RUN: opt -S -denormal-fp-math-bf16=positive-zero %s | FileCheck -check-prefixes=POSITIVEZEROBF16,ALL %s
+
 ; RUN: opt -S -denormal-fp-math=ieee -denormal-fp-math-f32=ieee %s | FileCheck -check-prefixes=IEEE-BOTH,ALL %s
 ; RUN: opt -S -denormal-fp-math=preserve-sign -denormal-fp-math-f32=preserve-sign %s | FileCheck -check-prefixes=PRESERVESIGN-BOTH,ALL %s
 ; RUN: opt -S -denormal-fp-math=positive-zero -denormal-fp-math-f32=positive-zero %s | FileCheck -check-prefixes=POSITIVEZERO-BOTH,ALL %s
 
-
+; RUN: opt -S -denormal-fp-math=ieee -denormal-fp-math-bf16=ieee %s | FileCheck -check-prefixes=IEEE-BOTH2,ALL %s
+; RUN: opt -S -denormal-fp-math=preserve-sign -denormal-fp-math-bf16=preserve-sign %s | FileCheck -check-prefixes=PRESERVESIGN-BOTH2,ALL %s
+; RUN: opt -S -denormal-fp-math=positive-zero -denormal-fp-math-bf16=positive-zero %s | FileCheck -check-prefixes=POSITIVEZERO-BOTH2,ALL %s
 
 ; ALL: @no_denormal_fp_math_attrs() [[NOATTR:#[0-9]+]] {
 define i32 @no_denormal_fp_math_attrs() #0 {
@@ -24,7 +30,7 @@ entry:
  ret i32 0
 }
 
-; ALL-DAG: attributes [[ATTR]] = { nounwind "denormal-fp-math"="preserve-sign,ieee" "denormal-fp-math-f32"="preserve-sign,ieee" }
+; ALL-DAG: attributes [[ATTR]] = { nounwind "denormal-fp-math"="preserve-sign,ieee" "denormal-fp-math-bf16"="preserve-sign,ieee" "denormal-fp-math-f32"="preserve-sign,ieee" }
 
 ; IEEE-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="ieee,ieee" }
 ; PRESERVESIGN-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" }
@@ -34,9 +40,17 @@ entry:
 ; PRESERVESIGNF32-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 ; POSITIVEZEROF32-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-f32"="positive-zero,positive-zero" }
 
+; IEEEBF16-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="ieee,ieee" }
+; PRESERVESIGNBF16-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="preserve-sign,preserve-sign" }
+; POSITIVEZEROBF16-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math-bf16"="positive-zero,positive-zero" }
+
 ; IEEE-BOTH-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="ieee,ieee" "denormal-fp-math-f32"="ieee,ieee" }
 ; PRESERVESIGN-BOTH-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-f32"="preserve-sign,preserve-sign" }
 ; POSITIVEZERO-BOTH-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="positive-zero,positive-zero" "denormal-fp-math-f32"="positive-zero,positive-zero" }
 
+; IEEE-BOTH2-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="ieee,ieee" "denormal-fp-math-bf16"="ieee,ieee" }
+; PRESERVESIGN-BOTH2-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="preserve-sign,preserve-sign" "denormal-fp-math-bf16"="preserve-sign,preserve-sign" }
+; POSITIVEZERO-BOTH2-DAG: attributes [[NOATTR]] = { nounwind "denormal-fp-math"="positive-zero,positive-zero" "denormal-fp-math-bf16"="positive-zero,positive-zero" }
+
 attributes #0 = { nounwind }
-attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,ieee" "denormal-fp-math-f32"="preserve-sign,ieee" }
+attributes #1 = { nounwind "denormal-fp-math"="preserve-sign,ieee" "denormal-fp-math-bf16"="preserve-sign,ieee" "denormal-fp-math-f32"="preserve-sign,ieee" }