Skip to content

Commit 770e10a

Browse files
mooskagh authored and tensorflower-gardener committed
[XLA:GPU] Move Dot strength reduction out of algebraic simplifier
and run it only once. The plan for the follow up changes is to remove vec×matrix reduction (currently regresses some models for unrelated reasons), and only keep vec×vec. PiperOrigin-RevId: 784472699
1 parent 835af34 commit 770e10a

14 files changed

+636
-225
lines changed

third_party/xla/xla/service/gpu/BUILD

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1023,6 +1023,7 @@ cc_library(
10231023
"//xla:xla_data_proto_cc",
10241024
"//xla/hlo/ir:hlo",
10251025
"//xla/service:algorithm_util",
1026+
"//xla/service/gpu/transforms:dot_algorithm_rewriter",
10261027
"//xla/stream_executor:blas",
10271028
"//xla/stream_executor:device_description",
10281029
"//xla/stream_executor:device_memory",
@@ -1037,7 +1038,6 @@ cc_library(
10371038
"@com_google_absl//absl/status:statusor",
10381039
"@com_google_absl//absl/strings",
10391040
"@com_google_absl//absl/types:span",
1040-
"@llvm-project//llvm:Support",
10411041
"@local_tsl//tsl/platform:errors",
10421042
"@local_tsl//tsl/platform:statusor",
10431043
],
@@ -1609,6 +1609,7 @@ cc_library(
16091609
"//xla/service/gpu/transforms:dot_dimension_sorter",
16101610
"//xla/service/gpu/transforms:dot_normalizer",
16111611
"//xla/service/gpu/transforms:dot_operand_converter",
1612+
"//xla/service/gpu/transforms:dot_strength_reduction",
16121613
"//xla/service/gpu/transforms:double_buffer_loop_unrolling",
16131614
"//xla/service/gpu/transforms:dynamic_slice_fusion_rewriter",
16141615
"//xla/service/gpu/transforms:explicit_collectives_group_async_wrapper",

third_party/xla/xla/service/gpu/gpu_compiler.cc

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -213,6 +213,7 @@ limitations under the License.
213213
#include "xla/service/gpu/transforms/dot_dimension_sorter.h"
214214
#include "xla/service/gpu/transforms/dot_normalizer.h"
215215
#include "xla/service/gpu/transforms/dot_operand_converter.h"
216+
#include "xla/service/gpu/transforms/dot_strength_reduction.h"
216217
#include "xla/service/gpu/transforms/double_buffer_loop_unrolling.h"
217218
#include "xla/service/gpu/transforms/dynamic_slice_fusion_rewriter.h"
218219
#include "xla/service/gpu/transforms/explicit_collectives_group_async_wrapper.h"
@@ -849,6 +850,8 @@ absl::Status RunOptimizationPasses(
849850
pipeline.AddPass<ScatterExpander>(
850851
ScatterExpander::kEliminateSimpleScatters);
851852
pipeline.AddPass<ScatterSliceSimplifier>();
853+
pipeline.AddPass<DotStrengthReduction>(
854+
gpu_target_config.device_description.gpu_compute_capability());
852855
pipeline.AddPass<GpuAlgebraicSimplifier>(layout_insensitive_algsimp_opts,
853856
gpu_version);
854857
pipeline.AddPass<BitcastDtypesExpander>();
@@ -1348,7 +1351,7 @@ AlgebraicSimplifierOptions GpuCompiler::GetAlgebraicSimplifierOptions(
13481351
bool is_rocm) {
13491352
AlgebraicSimplifierOptions opts;
13501353

1351-
opts.set_enable_dot_strength_reduction(true);
1354+
opts.set_enable_dot_strength_reduction(false);
13521355
// On GPU it helps to reorder them so that the fused cuDNN kernel can be
13531356
// used.
13541357
opts.set_enable_conv_add_multiply_reorder(true);

third_party/xla/xla/service/gpu/ir_emission_utils.cc

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -81,7 +81,7 @@ absl::StatusOr<bool> IsCublasSupportedMatMul(
8181
int num_matrix_operands = 0;
8282
for (int operand : {0, 1}) {
8383
TF_ASSIGN_OR_RETURN(DotOperandDims dims,
84-
DotOperandDims::FromDot(&dot, operand));
84+
DotOperandDims::FromDotOperand(&dot, operand));
8585
// cuBLAS only supports single contracting dimension.
8686
if (dims.DimensionCount(DotOperandDims::kContracting) != 1) {
8787
return false;

third_party/xla/xla/service/gpu/matmul_indexing_utils.cc

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ limitations under the License.
1515

1616
#include "xla/service/gpu/matmul_indexing_utils.h"
1717

18+
#include <array>
1819
#include <cstdint>
1920
#include <iterator>
2021
#include <vector>
@@ -101,7 +102,14 @@ DotOperandDims::DotOperandDims(Shape shape,
101102
contracting_dims.end());
102103
}
103104

104-
absl::StatusOr<DotOperandDims> DotOperandDims::FromDot(
105+
absl::StatusOr<std::array<DotOperandDims, 2>> DotOperandDims::FromDot(
106+
const HloInstruction* dot) {
107+
TF_ASSIGN_OR_RETURN(auto lhs_dims, FromDotOperand(dot, 0));
108+
TF_ASSIGN_OR_RETURN(auto rhs_dims, FromDotOperand(dot, 1));
109+
return std::array<DotOperandDims, 2>{lhs_dims, rhs_dims};
110+
}
111+
112+
absl::StatusOr<DotOperandDims> DotOperandDims::FromDotOperand(
105113
const HloInstruction* dot, int operand_idx) {
106114
TF_RET_CHECK(operand_idx == 0 || operand_idx == 1);
107115
const Shape& shape = dot->operand(operand_idx)->shape();

third_party/xla/xla/service/gpu/matmul_indexing_utils.h

Lines changed: 5 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,9 +63,12 @@ class DotOperandDims {
6363
absl::Span<const int64_t> contracting_dims);
6464

6565
enum Category { kBatch, kNonContracting, kContracting };
66+
// Creates a DotOperandDims from a dot instruction.
67+
static absl::StatusOr<std::array<DotOperandDims, 2>> FromDot(
68+
const HloInstruction* dot);
6669
// Creates a DotOperandDims from a dot instruction and operand index (0 or 1).
67-
static absl::StatusOr<DotOperandDims> FromDot(const HloInstruction* dot,
68-
int operand_idx);
70+
static absl::StatusOr<DotOperandDims> FromDotOperand(
71+
const HloInstruction* dot, int operand_idx);
6972
// Converts two DotOperandDims to a DotDimensionNumbers.
7073
static absl::StatusOr<DotDimensionNumbers> IntoDotDimensionNumbers(
7174
const DotOperandDims& lhs_dims, const DotOperandDims& rhs_dims);

third_party/xla/xla/service/gpu/matmul_utils.cc

Lines changed: 34 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -40,6 +40,7 @@ limitations under the License.
4040
#include "xla/service/algorithm_util.h"
4141
#include "xla/service/gpu/backend_configs.pb.h"
4242
#include "xla/service/gpu/matmul_indexing_utils.h"
43+
#include "xla/service/gpu/transforms/dot_algorithm_rewriter.h"
4344
#include "xla/shape.h"
4445
#include "xla/shape_util.h"
4546
#include "xla/status_macros.h"
@@ -895,17 +896,40 @@ PrimitiveType GetGemmAccumulatorType(HloDotInstruction* dot) {
895896
if (accumulator_type.ok()) {
896897
return accumulator_type.value();
897898
}
898-
// Otherwise, return the default accumulator type for the output type.
899-
PrimitiveType output_type = dot->shape().element_type();
900-
switch (output_type) {
901-
case PrimitiveType::F16:
902-
case PrimitiveType::BF16:
903-
return PrimitiveType::F32;
904-
case PrimitiveType::F32:
905-
case PrimitiveType::F64:
906-
case PrimitiveType::S32:
899+
900+
PrimitiveType shape_type = dot->shape().element_type();
901+
// If the output type is a floating point type with less than or equal to 32
902+
// bits, use f32 as the accumulator type.
903+
if (primitive_util::IsFloatingPointType(shape_type) &&
904+
primitive_util::BitWidth(shape_type) <= primitive_util::BitWidth(F32)) {
905+
return F32;
906+
}
907+
return shape_type;
908+
}
909+
910+
absl::StatusOr<HloInstruction*> MakeMultiplyForDotPrecisionAlgorithm(
911+
HloInstruction* lhs, HloInstruction* rhs,
912+
const PrecisionConfig::Algorithm& algorithm) {
913+
switch (algorithm) {
914+
case PrecisionConfig::ALG_DOT_BF16_BF16_F32:
915+
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32(lhs, rhs);
916+
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3:
917+
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X3(lhs, rhs);
918+
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6:
919+
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X6(lhs, rhs);
920+
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9:
921+
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X9(lhs, rhs);
922+
case PrecisionConfig::ALG_DOT_TF32_TF32_F32:
923+
return DotAlgorithmRewriter::MakeMultiplyForTF32TF32F32(lhs, rhs);
924+
case PrecisionConfig::ALG_DOT_TF32_TF32_F32_X3:
925+
return DotAlgorithmRewriter::MakeMultiplyForTF32TF32F32X3(lhs, rhs);
926+
case PrecisionConfig::ALG_DOT_F32_F32_F32:
927+
case PrecisionConfig::ALG_UNSET:
928+
return lhs->parent()->AddInstruction(HloInstruction::CreateBinary(
929+
lhs->shape(), HloOpcode::kMultiply, lhs, rhs));
907930
default:
908-
return output_type;
931+
return absl::InvalidArgumentError(
932+
absl::StrCat("Unsupported dot precision algorithm: ", algorithm));
909933
}
910934
}
911935

third_party/xla/xla/service/gpu/matmul_utils.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,13 @@ bool IsDotSupportedByClassicalEmitters(const HloInstruction& dot);
6363
// from the dot algorithm or inferred from the output type).
6464
PrimitiveType GetGemmAccumulatorType(HloDotInstruction* dot);
6565

66+
// Makes algorithm specific set of instructions which would multiply lhs and rhs
67+
// like the dot with the given precision algorithm would. Useful e.g. rewriting
68+
// dot as multiply+reduce.
69+
absl::StatusOr<HloInstruction*> MakeMultiplyForDotPrecisionAlgorithm(
70+
HloInstruction* lhs, HloInstruction* rhs,
71+
const PrecisionConfig::Algorithm& algorithm);
72+
6673
// extending plain MatrixLayout struct with creator functions
6774
struct MatrixLayout : public se::gpu::MatrixLayout {
6875
// Returns the matrix layout for a logical shape (batch, rows, columns).

third_party/xla/xla/service/gpu/transforms/BUILD

Lines changed: 43 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -87,11 +87,11 @@ xla_cc_test(
8787
"//xla/hlo/transforms/simplifiers:algebraic_simplifier",
8888
"//xla/service:pattern_matcher",
8989
"//xla/stream_executor:device_description",
90+
"//xla/stream_executor/cuda:cuda_compute_capability",
9091
"//xla/tests:xla_internal_test_main",
9192
"//xla/tsl/platform:statusor",
9293
"@com_google_absl//absl/strings:string_view",
9394
"@com_google_googletest//:gtest",
94-
"@local_tsl//tsl/platform:statusor",
9595
],
9696
)
9797

@@ -1182,6 +1182,48 @@ xla_test(
11821182
),
11831183
)
11841184

1185+
cc_library(
1186+
name = "dot_strength_reduction",
1187+
srcs = ["dot_strength_reduction.cc"],
1188+
hdrs = ["dot_strength_reduction.h"],
1189+
deps = [
1190+
":dot_algorithm_rewriter",
1191+
"//xla:literal",
1192+
"//xla:literal_util",
1193+
"//xla:shape_util",
1194+
"//xla/backends/gpu/codegen/triton:support",
1195+
"//xla/hlo/ir:hlo",
1196+
"//xla/hlo/transforms/expanders:op_expander_pass",
1197+
"//xla/service/gpu:matmul_indexing_utils",
1198+
"//xla/service/gpu:matmul_utils",
1199+
"//xla/stream_executor:device_description",
1200+
"//xla/tsl/platform:statusor",
1201+
"@com_google_absl//absl/algorithm:container",
1202+
"@com_google_absl//absl/log:check",
1203+
"@com_google_absl//absl/status:statusor",
1204+
"@com_google_absl//absl/strings:string_view",
1205+
"@com_google_absl//absl/types:span",
1206+
],
1207+
)
1208+
1209+
xla_test(
1210+
name = "dot_strength_reduction_test",
1211+
srcs = ["dot_strength_reduction_test.cc"],
1212+
backends = ["gpu"],
1213+
deps = [
1214+
":dot_strength_reduction",
1215+
"//xla/hlo/ir:hlo",
1216+
"//xla/hlo/testlib:filecheck",
1217+
"//xla/hlo/testlib:hlo_hardware_independent_test_base",
1218+
"//xla/hlo/testlib:verified_hlo_module",
1219+
"//xla/stream_executor:device_description",
1220+
"//xla/stream_executor/cuda:cuda_compute_capability",
1221+
"//xla/tsl/platform:statusor",
1222+
"@com_google_absl//absl/log:check",
1223+
"@com_google_googletest//:gtest_main",
1224+
],
1225+
)
1226+
11851227
cc_library(
11861228
name = "double_buffer_loop_unrolling",
11871229
srcs = ["double_buffer_loop_unrolling.cc"],

third_party/xla/xla/service/gpu/transforms/algebraic_simplifier.cc

Lines changed: 2 additions & 59 deletions
Original file line numberDiff line numberDiff line change
@@ -100,65 +100,8 @@ bool GpuAlgebraicSimplifierVisitor::SupportedDotPrecisionConfig(
100100
absl::StatusOr<HloInstruction*>
101101
GpuAlgebraicSimplifierVisitor::MakeMultiplyForPrecisionAlgorithm(
102102
HloInstruction* dot, HloInstruction* lhs, HloInstruction* rhs) {
103-
const auto algorithm = dot->precision_config().algorithm();
104-
switch (algorithm) {
105-
case PrecisionConfig::ALG_DOT_BF16_BF16_F32:
106-
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32(lhs, rhs);
107-
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X3:
108-
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X3(lhs, rhs);
109-
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X6:
110-
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X6(lhs, rhs);
111-
case PrecisionConfig::ALG_DOT_BF16_BF16_F32_X9:
112-
return DotAlgorithmRewriter::MakeMultiplyForBF16BF16F32X9(lhs, rhs);
113-
case PrecisionConfig::ALG_DOT_TF32_TF32_F32:
114-
return DotAlgorithmRewriter::MakeMultiplyForTF32TF32F32(lhs, rhs);
115-
case PrecisionConfig::ALG_DOT_TF32_TF32_F32_X3:
116-
return DotAlgorithmRewriter::MakeMultiplyForTF32TF32F32X3(lhs, rhs);
117-
case PrecisionConfig::ALG_DOT_F32_F32_F32:
118-
return MakeBinaryHlo(HloOpcode::kMultiply, lhs, rhs);
119-
case PrecisionConfig::ALG_UNSET:
120-
return MakeBinaryHlo(HloOpcode::kMultiply, lhs, rhs);
121-
default:
122-
CHECK(false) << "Unsupported dot precision algorithm: " << algorithm;
123-
}
124-
}
125-
126-
bool GpuAlgebraicSimplifierVisitor::ShouldStrengthReduceDotToReduce(
127-
const HloInstruction* hlo) {
128-
if (!options_.enable_dot_strength_reduction()) {
129-
return false;
130-
}
131-
132-
const HloDotInstruction* dot = DynCast<HloDotInstruction>(hlo);
133-
if (dot == nullptr) {
134-
return false;
135-
}
136-
137-
const HloInstruction* lhs = dot->operand(0);
138-
const HloInstruction* rhs = dot->operand(1);
139-
DotDimensionNumbers dnums = dot->dot_dimension_numbers();
140-
bool lhs_is_vector = (dnums.lhs_batch_dimensions_size() +
141-
dnums.lhs_contracting_dimensions_size() ==
142-
lhs->shape().dimensions().size());
143-
bool rhs_is_vector = (dnums.rhs_batch_dimensions_size() +
144-
dnums.rhs_contracting_dimensions_size() ==
145-
rhs->shape().dimensions().size());
146-
// Strength-reduce vector-vector dots since they are not supported by
147-
// GemmFusion.
148-
if (lhs_is_vector && rhs_is_vector) {
149-
return true;
150-
}
151-
152-
absl::StatusOr<bool> is_too_small =
153-
IsMatrixMultiplicationTooSmallForRewriting(*hlo, /*threshold=*/10000000);
154-
CHECK_OK(is_too_small.status());
155-
if (is_too_small.value()) {
156-
return true;
157-
}
158-
159-
// If GemmFusion cannot handle this dot, we should strength-reduce it so that
160-
// it can be handled by the fusion pipeline.
161-
return !legacy_triton::CanTritonHandleGEMM(*dot, compute_capability_);
103+
return MakeMultiplyForDotPrecisionAlgorithm(
104+
lhs, rhs, dot->precision_config().algorithm());
162105
}
163106

164107
} // namespace xla::gpu

third_party/xla/xla/service/gpu/transforms/algebraic_simplifier.h

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -41,8 +41,6 @@ class GpuAlgebraicSimplifierVisitor : public AlgebraicSimplifierVisitor {
4141

4242
absl::Status HandleAdd(HloInstruction* add) override;
4343

44-
bool ShouldStrengthReduceDotToReduce(const HloInstruction* hlo) override;
45-
4644
private:
4745
// Returns true if the dot precision config is supported by simplifier.
4846
bool SupportedDotPrecisionConfig(const PrecisionConfig& config,

0 commit comments

Comments (0)
pFad - Phonifier reborn

pFad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy