
Commit 577af2e

flex attention: fix dispatch order for tensor subclasses, avoid hardcoding call to faketensor impl in dynamo
ghstack-source-id: 17535e0 Pull Request resolved: #151719
1 parent 4dfcdbe commit 577af2e

File tree

2 files changed: +122, -8 lines


torch/_dynamo/variables/higher_order_ops.py

Lines changed: 27 additions & 7 deletions
@@ -2636,7 +2636,8 @@ def call_function(
         args: "list[VariableTracker]",
         kwargs: "dict[str, VariableTracker]",
     ) -> "VariableTracker":
-        from torch._higher_order_ops.flex_attention import flex_attention_fake_impl
+        from torch._higher_order_ops.flex_attention import flex_attention
+        from . import TensorVariable
 
         from .builder import wrap_fx_proxy
 
@@ -2660,6 +2661,31 @@ def call_function(
             tx, query, mask_fn, "mask_fn"
         )
 
+        def unwrap_proxy_to_faketensor(x):
+            if isinstance(x, TupleVariable):
+                return pytree.tree_map(unwrap_proxy_to_faketensor, x.items)
+            if isinstance(x, (TensorVariable, SymNodeVariable)):
+                x_proxy = x.as_proxy()
+                return x_proxy.node.meta['example_value']
+            else:
+                return x.as_python_constant()
+
+        # use all of the args for faketensor prop
+        vt_full_args = [
+            query,
+            key,
+            value,
+            score_mod,
+            block_mask,
+            scale,
+            kernel_options,
+        ]
+        all_fake_args = pytree.tree_map(unwrap_proxy_to_faketensor, vt_full_args)
+
+        with torch._guards.TracingContext.try_get().fake_mode:
+            out_meta, lse_meta = flex_attention(*all_fake_args)
+        example_value = (out_meta, lse_meta)
+
         proxied_args = [
             query,
             key,
@@ -2674,12 +2700,6 @@ def call_function(
         # Proxying user defined functions is not supported.
         inp_args, _ = proxy_args_kwargs(proxied_args, {})
 
-        query_meta = query.as_proxy().node.meta["example_value"]
-        value_meta = value.as_proxy().node.meta["example_value"]
-        with torch._guards.TracingContext.try_get().fake_mode:
-            out_meta, lse_meta = flex_attention_fake_impl(query_meta, value_meta)
-        example_value = (out_meta, lse_meta)
-
         # Compose the ordered HOO args:
         # - inp_args: [query, key, value, block_mask, scale, kernel_options]
         # - subgraph node: [score_mod, mask_fn_node]
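
The Dynamo change above stops hardcoding flex_attention_fake_impl: it unwraps the VariableTrackers to their fake-tensor example values and calls the flex_attention HOP under the ambient fake mode, so the example_value metadata comes from whatever fake implementation the operator dispatches to. Below is a minimal, self-contained sketch of that fake-propagation idea; the helper name fake_propagate is assumed, and scaled_dot_product_attention stands in for the flex_attention HOP (this is not the Dynamo code itself):

import torch
from torch._subclasses import FakeTensorMode
from torch.utils._pytree import tree_map


def fake_propagate(fn, *args):
    # Run `fn` on fake copies of `args`: the outputs carry only
    # shape/dtype/device metadata, no real compute happens.
    fake_mode = FakeTensorMode()

    def to_fake(x):
        return fake_mode.from_tensor(x) if isinstance(x, torch.Tensor) else x

    with fake_mode:
        return fn(*tree_map(to_fake, args))


q = torch.randn(2, 4, 128, 64)
k = torch.randn(2, 4, 128, 64)
v = torch.randn(2, 4, 128, 64)
out = fake_propagate(torch.nn.functional.scaled_dot_product_attention, q, k, v)
print(out.shape)  # torch.Size([2, 4, 128, 64]) -- metadata only, no attention computed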

torch/_higher_order_ops/flex_attention.py

Lines changed: 95 additions & 1 deletion
@@ -17,7 +17,8 @@
     validate_subgraph_args_types,
 )
 from torch._ops import HigherOrderOperator
-from torch._subclasses import FakeTensorMode
+from torch._subclasses import FakeTensor, FakeTensorMode
+from torch._subclasses.functional_tensor import FunctionalTensor
 from torch.fx.experimental.proxy_tensor import (
     make_fx,
     ProxyTorchDispatchMode,
@@ -396,6 +397,29 @@ def flex_attention_functionalize(
     """
     from torch._dynamo._trace_wrapped_higher_order_op import TransformGetItemToIndex
 
+    flat_args, _ = pytree.tree_flatten(
+        (
+            query,
+            key,
+            value,
+            score_mod,
+            block_mask,
+            scale,
+            kernel_options,
+            score_mod_other_buffers,
+            mask_mod_other_buffers,
+        )
+    )
+    # For tensor subclasses, give the subclass a chance to run first
+    if any(
+        isinstance(a, torch.Tensor)
+        and type(a) is not torch.Tensor
+        and not isinstance(a, FakeTensor)
+        and not isinstance(a, FunctionalTensor)
+        for a in flat_args
+    ):
+        return NotImplemented
+
     query_unwrapped = ctx.unwrap_tensors(query)
     key_unwrapped = ctx.unwrap_tensors(key)
     value_unwrapped = ctx.unwrap_tensors(value)
@@ -473,6 +497,27 @@ def flex_attention_fake_tensor_mode(
     score_mod_other_buffers: tuple = (),
     mask_mod_other_buffers: tuple = (),
 ) -> tuple[torch.Tensor, torch.Tensor]:
+    flat_args, _ = pytree.tree_flatten(
+        (
+            query,
+            key,
+            value,
+            score_mod,
+            block_mask,
+            scale,
+            kernel_options,
+            score_mod_other_buffers,
+            mask_mod_other_buffers,
+        )
+    )
+    # For tensor subclasses, give the subclass a chance to run first
+    if any(
+        isinstance(a, torch.Tensor)
+        and type(a) is not torch.Tensor
+        and not isinstance(a, FakeTensor)
+        for a in flat_args
+    ):
+        return NotImplemented
     with mode:
         out, logsumexp = flex_attention_fake_impl(query, value)
         return out, logsumexp
@@ -1086,6 +1131,31 @@ def flex_attention_backward_functionalize(
     since we know that the forward score mod function is assured to be free of mutations
     to the other_buffers, we skip that mutate check and go straight to redispatching.
     """
+    flat_args, _ = pytree.tree_flatten(
+        (
+            query,
+            key,
+            value,
+            out,
+            logsumexp,
+            grad_out,
+            grad_logsumexp,
+            block_mask,
+            scale,
+            kernel_options,
+            score_mod_other_buffers,
+            mask_mod_other_buffers,
+        )
+    )
+    # For tensor subclasses, give the subclass a chance to run first
+    if any(
+        isinstance(a, torch.Tensor)
+        and type(a) is not torch.Tensor
+        and not isinstance(a, FakeTensor)
+        and not isinstance(a, FunctionalTensor)
+        for a in flat_args
+    ):
+        return NotImplemented
     query_unwrapped = ctx.unwrap_tensors(query)
     key_unwrapped = ctx.unwrap_tensors(key)
     value_unwrapped = ctx.unwrap_tensors(value)
@@ -1158,6 +1228,30 @@ def flex_attention_backward_fake_tensor_mode(
 ) -> tuple[
     torch.Tensor, torch.Tensor, torch.Tensor, tuple[Optional[torch.Tensor], ...]
 ]:
+    flat_args, _ = pytree.tree_flatten(
+        (
+            query,
+            key,
+            value,
+            out,
+            logsumexp,
+            grad_out,
+            grad_logsumexp,
+            block_mask,
+            scale,
+            kernel_options,
+            score_mod_other_buffers,
+            mask_mod_other_buffers,
+        )
+    )
+    # For tensor subclasses, give the subclass a chance to run first
+    if any(
+        isinstance(a, torch.Tensor)
+        and type(a) is not torch.Tensor
+        and not isinstance(a, FakeTensor)
+        for a in flat_args
+    ):
+        return NotImplemented
     with mode:
         Bq, _, _, qk_head_dim = query.shape
         Bkv, Hkv, seq_len_kv, v_head_dim = value.shape
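
The checks added above follow the NotImplemented convention in PyTorch's Python dispatch: when any argument is a tensor subclass that is not already a FakeTensor (or a FunctionalTensor, in the functionalization kernels), these implementations step aside and return NotImplemented so the subclass's own __torch_dispatch__ handler runs first and decides how to unwrap and redispatch. A toy wrapper subclass, illustrative only and not part of this commit (the name WrapperTensor is assumed), showing what "the subclass runs first" looks like for an ordinary op:

import torch
from torch.utils._pytree import tree_map


class WrapperTensor(torch.Tensor):
    # Generic wrapper subclass: owns no storage, delegates every op to `inner`.
    @staticmethod
    def __new__(cls, inner):
        return torch.Tensor._make_wrapper_subclass(
            cls, inner.shape, dtype=inner.dtype, device=inner.device
        )

    def __init__(self, inner):
        self.inner = inner

    def __repr__(self):
        return f"WrapperTensor({self.inner!r})"

    @classmethod
    def __torch_dispatch__(cls, func, types, args=(), kwargs=None):
        kwargs = kwargs or {}

        def unwrap(x):
            return x.inner if isinstance(x, WrapperTensor) else x

        def rewrap(x):
            return WrapperTensor(x) if isinstance(x, torch.Tensor) else x

        # The subclass intercepts the op before any fake/functional kernel does,
        # runs the real kernel on the inner tensors, and rewraps the result.
        out = func(*tree_map(unwrap, args), **tree_map(unwrap, kwargs))
        return tree_map(rewrap, out)


x = WrapperTensor(torch.randn(4))
print(x + 1)  # handled by WrapperTensor.__torch_dispatch__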

0 commit comments
