
Commit 0064862

Add native strided API for MPSNDArray
1 parent 5b5d269 commit 0064862

19 files changed: 486 additions, 92 deletions

aten/src/ATen/mps/MPSDevice.h

Lines changed: 1 addition & 0 deletions
@@ -31,6 +31,7 @@ enum class MacOSVersion : uint32_t {
   MACOS_VER_13_2_PLUS,
   MACOS_VER_13_3_PLUS,
   MACOS_VER_14_0_PLUS,
+  MACOS_VER_15_0_PLUS,
 };
 
 //-----------------------------------------------------------------

aten/src/ATen/mps/MPSDevice.mm

Lines changed: 7 additions & 0 deletions
@@ -37,7 +37,11 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   if (!_mtl_indexing_library) {
     MTLCompileOptions* options = [MTLCompileOptions new];
     [options setLanguageVersion:getMetalLanguageVersion(_mtl_device, isMacOS13Plus(MacOSVersion::MACOS_VER_13_0_PLUS))];
+#if defined(__MAC_15_0)
+    options.mathMode = MTLMathModeFast;
+#else
     [options setFastMathEnabled:YES];
+#endif
     _mtl_indexing_library = [_mtl_device newLibraryWithSource:[NSString stringWithCString:mps::indexing_metal_shaders
                                                                                  encoding:NSASCIIStringEncoding]
                                                       options:options
@@ -118,6 +122,7 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
   static bool _macos_13_3_plus = [compileOptions respondsToSelector:@selector(maxTotalThreadsPerThreadgroup)] == YES;
 
   static bool _macos_14_0_plus = [mpsCD instancesRespondToSelector:@selector(conjugateWithTensor:name:)] == YES;
+  static bool _macos_15_0_plus = [mpsCD respondsToSelector:@selector(variableFromTensorWithTensor:name:)] == YES;
 
   switch (version) {
     case MacOSVersion::MACOS_VER_13_0_PLUS:
@@ -130,6 +135,8 @@ static inline MTLLanguageVersion getMetalLanguageVersion(const id<MTLDevice>& de
       return _macos_13_3_plus;
     case MacOSVersion::MACOS_VER_14_0_PLUS:
       return _macos_14_0_plus;
+    case MacOSVersion::MACOS_VER_15_0_PLUS:
+      return _macos_15_0_plus;
     default:
       return false;
   }
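The macOS 15 capability is detected at runtime by probing a selector that first shipped with the macOS 15 SDK, rather than by parsing version strings. Code in the MPS backend reaches it through the existing is_macos_13_or_newer() helper, exactly as the needsGather() change further down does. A minimal sketch of such a call site, assuming it lives where the MPSDevice.h helpers are visible (the wrapper function itself is illustrative, not part of this commit):

// Illustrative sketch: gating a strided fast path on the new runtime check.
// is_macos_13_or_newer() and MacOSVersion are the existing helpers from
// ATen/mps/MPSDevice.h; only the MACOS_VER_15_0_PLUS value is new here.
static bool canUseNativeStridedNDArray() {
  static const bool is_macos_15_plus = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
  return is_macos_15_plus;
}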

aten/src/ATen/native/Convolution.cpp

Lines changed: 1 addition & 1 deletion
@@ -1665,7 +1665,7 @@ at::Tensor _convolution(
                 "Input type (", input.toString(), ") and bias type (", bias.toString(),
                 ") should be the same");
 
-    output = at::_mps_convolution(input.contiguous(), weight, bias.defined() ? bias.contiguous() : bias,
+    output = at::_mps_convolution(input, weight, bias.defined() ? bias.contiguous() : bias,
                                   params.padding, params.stride, params.dilation,
                                   params.groups);
 #else
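With the strided path available, the MPS convolution no longer forces its input contiguous on the ATen side; on macOS 15 a non-contiguous input can be consumed in place, while older systems still fall back to the gather path inside Placeholder. A hedged libtorch-level illustration of the kind of input this affects (the example program is not part of the commit):

#include <ATen/ATen.h>

int main() {
  // A permuted (non-contiguous) input on the MPS device.
  auto x = at::rand({1, 8, 16, 16}, at::device(at::kMPS)).permute({0, 1, 3, 2});
  auto w = at::rand({4, 8, 3, 3}, at::device(at::kMPS));
  // Previously the MPS branch called input.contiguous() here, materializing a
  // copy of x; now the strided tensor is handed to _mps_convolution as-is.
  auto y = at::conv2d(x, w);
  return y.defined() ? 0 : 1;
}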
aten/src/ATen/native/mps/MPSGraphSequoiaOps.h

Lines changed: 27 additions & 0 deletions
@@ -0,0 +1,27 @@
+#pragma once
+
+#include <MetalPerformanceShadersGraph/MetalPerformanceShadersGraph.h>
+
+#if !defined(__MAC_15_0) && \
+    (!defined(MAC_OS_X_VERSION_15_0) || (MAC_OS_X_VERSION_MIN_REQUIRED < MAC_OS_X_VERSION_15_0))
+
+@interface MPSNDArrayIdentity : MPSNDArrayUnaryKernel
+-(MPSNDArray * __nullable) reshapeWithCommandBuffer: (__nullable id <MTLCommandBuffer>) cmdBuf
+                                         sourceArray: (MPSNDArray * __nonnull) sourceArray
+                                               shape: (MPSShape * __nonnull) shape
+                                    destinationArray: (MPSNDArray * __nullable) destinationArray;
+@end
+
+@interface MPSNDArrayDescriptor()
+@property (readwrite, nonatomic) BOOL preferPackedRows;
+@end
+
+@interface MPSNDArray()
+-(nonnull instancetype) initWithBuffer:(id<MTLBuffer> _Nonnull) buffer
+                                offset:(NSUInteger) offset
+                            descriptor:(MPSNDArrayDescriptor * _Nonnull) descriptor;
+-(MPSNDArray * __nullable) arrayViewWithShape:(MPSShape * _Nullable) shape
+                                      strides:(MPSShape * _Nonnull) strides;
+@end
+
+#endif
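These category declarations mirror the MPSNDArray API that ships with the macOS 15 SDK so the code below can compile against older SDKs; the calls remain gated on MACOS_VER_15_0_PLUS at runtime. A minimal sketch of how the two key entry points combine to alias an existing Metal buffer as a strided array, assuming a float32 layout and a valid buffer (the helper itself is illustrative):

// Sketch only: wrap an MTLBuffer as a strided MPSNDArray view (no copy),
// using the interfaces declared above. Mirrors getMPSNDArray() further down.
static MPSNDArray* _Nullable wrapAsStridedNDArray(id<MTLBuffer> _Nonnull buffer,
                                                  NSUInteger byteOffset,
                                                  MPSShape* _Nonnull sizes,
                                                  MPSShape* _Nonnull strides) {
  MPSNDArrayDescriptor* desc = [MPSNDArrayDescriptor descriptorWithDataType:MPSDataTypeFloat32
                                                                      shape:sizes];
  desc.preferPackedRows = YES;  // rows are tightly packed in the backing buffer
  MPSNDArray* ndarray = [[[MPSNDArray alloc] initWithBuffer:buffer
                                                     offset:byteOffset
                                                 descriptor:desc] autorelease];
  // Re-express the same storage with explicit strides.
  return [ndarray arrayViewWithShape:sizes strides:strides];
}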

aten/src/ATen/native/mps/OperationUtils.h

Lines changed: 7 additions & 2 deletions
@@ -88,7 +88,10 @@ MPSGraphTensorData* getMPSGraphTensorDataForView(const Tensor& src, MPSShape *mp
 MPSGraphTensor* castToIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
 MPSGraphTensor* castFromIHFTypes(MPSGraph* mpsGraph, MPSGraphTensor* inputTensor, const Tensor& input, bool includesInt64 = false);
 
+MPSNDArray* getMPSNDArray(const at::Tensor& t, const IntArrayRef& sizes = {}, const IntArrayRef& strides = {});
+MPSNDArray* getMPSNDArray(const at::Tensor& t, MPSShape* sizes = nil, MPSShape* strides = nil);
 // The MPSShape could vary based on memory format
+Tensor getTensorView(const Tensor& t, MPSShape* shape);
 MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
 MPSShape* getMPSShape(IntArrayRef sizes, c10::MemoryFormat memory_format = MemoryFormat::Contiguous);
 
@@ -100,8 +103,9 @@ class Placeholder {
  public:
   Placeholder() : _placeholder(nullptr), _value(nullptr), _tensor(Tensor()) {}
   Placeholder(MPSGraphTensor* mpsGraphTensor) : _placeholder(mpsGraphTensor), _value(nullptr), _tensor(Tensor()) {}
+  Placeholder(MPSGraphTensor* mpsGraphTensor, MPSNDArray* mpsNDArray);
   Placeholder(MPSGraphTensor* mpsGraphTensor, const Tensor& self, MPSShape *mpsShape = nullptr,
-              bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid);
+              bool gatherTensorData = true, MPSDataType dataType = MPSDataTypeInvalid, bool useMPSStridedAPI = true);
   MPSGraphTensor* getMPSGraphTensor() {
     return _placeholder;
   }
@@ -431,7 +435,8 @@ inline bool supportedFloatingOrComplexType(const Tensor& t) {
 
 
 inline bool needsGather(const Tensor& t) {
-  return !t.is_contiguous() || t.storage_offset();
+  static const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
+  return !is_macOS_15_0_or_newer && (!t.is_contiguous() || t.storage_offset());
 }
 
 } // namespace at::native::mps
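The reworked needsGather() keeps the old behavior on macOS 14 and earlier but always returns false on macOS 15, so strided views flow straight into the new NDArray path. An illustrative caller-side pattern (not from this commit) for ops that still require a dense input:

// Illustrative only: ops that cannot consume strided inputs still materialize
// a contiguous copy when needsGather() says so; on macOS 15 this returns the
// view unchanged and the strided MPSNDArray path is used instead.
static Tensor materializeIfNeeded(const Tensor& t) {
  return needsGather(t) ? t.contiguous() : t;
}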

aten/src/ATen/native/mps/OperationUtils.mm

Lines changed: 209 additions & 15 deletions
@@ -3,6 +3,7 @@
 #include <ATen/TensorIterator.h>
 #include <ATen/mps/MPSAllocatorInterface.h>
 #include <ATen/mps/MPSProfiler.h>
+#include <ATen/native/mps/MPSGraphSequoiaOps.h>
 #include <ATen/native/mps/MPSGraphSonomaOps.h>
 #include <ATen/native/mps/MPSGraphVenturaOps.h>
 #include <ATen/native/mps/OperationUtils.h>
@@ -303,6 +304,16 @@ MPSDataType getMPSScalarType(ScalarType scalar_type) {
   return str;
 }
 
+Tensor getTensorView(const Tensor& t, MPSShape* shape) {
+  std::vector<int64_t> res;
+  res.reserve([shape count]);
+  for (NSNumber* elem in shape) {
+    res.push_back(elem.longLongValue);
+  }
+  IntArrayRef r = IntArrayRef(res);
+  return t.view(res);
+}
+
 MPSShape* getMPSShape(const Tensor& t, c10::MemoryFormat memory_format) {
   return getMPSShape(t.sizes(), memory_format);
 }
@@ -359,26 +370,152 @@ void printTensorNDArray(const Tensor& t) {
   return [tmpGraphTensorData mpsndarray];
 }
 
+static std::vector<int64_t> getSortedStrides(const IntArrayRef& s) {
+  std::vector<int64_t> idx(s.size());
+  iota(idx.begin(), idx.end(), 0);
+  sort(idx.begin(), idx.end(), [&s](size_t i1, size_t i2) { return s[i1] > s[i2]; });
+
+  return idx;
+}
+
+static std::vector<int64_t> inversePermutation(std::vector<int64_t> permuteOrder) {
+  auto size = permuteOrder.size();
+  std::vector<int64_t> inversePerm(permuteOrder.size());
+
+  for (int i = 0; i < size; i++) {
+    inversePerm[permuteOrder[i]] = i;
+  }
+  return inversePerm;
+}
+
+static MPSNDArray* permuteNDArray(MPSNDArray* inArray, std::vector<int64_t> permuteOrder_) {
+  auto permuteOrder = inversePermutation(permuteOrder_);
+  NSUInteger srcRank = [inArray numberOfDimensions];
+  if (srcRank != permuteOrder.size()) {
+    TORCH_INTERNAL_ASSERT(false);
+    return nil;
+  }
+  std::vector<NSUInteger> dimensionOrder(srcRank);
+  std::iota(std::begin(dimensionOrder), std::end(dimensionOrder), 0);
+  MPSNDArrayDescriptor* desc = [inArray descriptor];
+
+  for (int64_t i = srcRank - 1; i >= 0; i--) {
+    NSUInteger axis = permuteOrder[i];
+    auto axisIter = std::find(dimensionOrder.begin(), dimensionOrder.end(), axis);
+    NSUInteger axis1 = srcRank - i - 1;
+    NSUInteger axis2 = dimensionOrder.end() - axisIter - 1;
+    iter_swap(dimensionOrder.begin() + i, axisIter);
+    if (axis1 != axis2) {
+      [desc transposeDimension:axis1 withDimension:axis2];
+    }
+  }
+  C10_CLANG_DIAGNOSTIC_PUSH()
+#if C10_CLANG_HAS_WARNING("-Wnonnull")
+  C10_CLANG_DIAGNOSTIC_IGNORE("-Wnonnull")
+#endif
+  MPSNDArray* result = [inArray arrayViewWithCommandBuffer:nil descriptor:desc aliasing:MPSAliasingStrategyShallAlias];
+  C10_CLANG_DIAGNOSTIC_POP()
+
+  TORCH_INTERNAL_ASSERT(result != nil);
+  return result;
+}
+
+MPSNDArray* getMPSNDArray(const at::Tensor& t, MPSShape* sizes, MPSShape* strides) {
+  id<MTLBuffer> srcBuf = getMTLBufferStorage(t);
+
+  MPSDataType mpsDataType = getMPSDataType(t.scalar_type());
+  MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:mpsDataType shape:sizes];
+  srcTensorDesc.preferPackedRows = YES;
+  MPSNDArray* srcNDArray = [[[MPSNDArray alloc] initWithBuffer:srcBuf
+                                                        offset:t.storage_offset() * t.element_size()
+                                                    descriptor:srcTensorDesc] autorelease];
+  if (strides != nil) {
+    srcNDArray = [srcNDArray arrayViewWithShape:sizes strides:strides];
+  }
+  return srcNDArray;
+}
+
+MPSNDArray* getMPSNDArray(const at::Tensor& t, const IntArrayRef& sizes, const IntArrayRef& strides) {
+  return getMPSNDArray(t, getMPSShape(sizes.empty() ? t.sizes() : sizes), strides.empty() ? nil : getMPSShape(strides));
+}
+
+static MPSNDArray* getStridedMPSNDArray(const at::Tensor& src, MPSNDArray* srcNDArray) {
+  auto strides = src.strides();
+  auto sizes = src.sizes();
+  auto nStrides = strides.size();
+  auto nonZeroStrides = src.strides();
+  int64_t crtNonZeroStride = 1;
+  bool hasZeroStrides = false;
+  auto sortedStridesIndices = getSortedStrides(nonZeroStrides);
+
+  NSMutableArray<NSNumber*>* sortedStridesShape = [NSMutableArray arrayWithCapacity:nStrides];
+  NSMutableArray<NSNumber*>* sortedMPSShape = [NSMutableArray arrayWithCapacity:nStrides];
+  for (const auto i : c10::irange(nStrides)) {
+    sortedStridesShape[i] = [NSNumber numberWithInteger:nonZeroStrides[sortedStridesIndices[i]]];
+    sortedMPSShape[i] = [NSNumber numberWithInteger:sizes[sortedStridesIndices[i]]];
+  }
+  MPSShape* originalSortedMPSShape = sortedMPSShape;
+  MPSShape* originalSortedStridesShape = sortedStridesShape;
+  bool hasNonZeroStrides = nStrides == 0 ? false : nonZeroStrides[sortedStridesIndices[nStrides - 1]] != 1;
+  if (hasNonZeroStrides) {
+    originalSortedMPSShape = [sortedMPSShape copy];
+    originalSortedStridesShape = [sortedStridesShape copy];
+    [sortedStridesShape addObject:[NSNumber numberWithInteger:1]];
+    [sortedMPSShape addObject:[NSNumber numberWithInteger:1]];
+  }
+  if (nStrides == 0) {
+    originalSortedMPSShape = getMPSShape(src);
+    originalSortedStridesShape = getMPSShape(src.strides());
+  }
+
+  srcNDArray = [srcNDArray arrayViewWithShape:sortedMPSShape strides:sortedStridesShape];
+  if (hasNonZeroStrides) {
+    MPSNDArrayIdentity* identity =
+        [[[MPSNDArrayIdentity alloc] initWithDevice:MPSDevice::getInstance()->device()] autorelease];
+    srcNDArray = [identity reshapeWithCommandBuffer:nil
+                                        sourceArray:srcNDArray
+                                              shape:originalSortedMPSShape
+                                   destinationArray:nil];
+  }
+  TORCH_INTERNAL_ASSERT(srcNDArray);
+
+  srcNDArray = permuteNDArray(srcNDArray, sortedStridesIndices);
+  TORCH_INTERNAL_ASSERT(srcNDArray);
+
+  return srcNDArray;
+}
+
+Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor, MPSNDArray* mpsNDArray) {
+  _placeholder = mpsGraphTensor;
+  _value = [[[MPSGraphTensorData alloc] initWithMPSNDArray:mpsNDArray] autorelease];
+}
+
 Placeholder::Placeholder(MPSGraphTensor* mpsGraphTensor,
                          const Tensor& src,
-                         MPSShape* mpsShape,
+                         MPSShape* mpsShape_,
                          bool gatherTensorData,
-                         MPSDataType dataType)
+                         MPSDataType dataType,
+                         bool useMPSStridedAPI)
     : _tensor(src) {
   TORCH_CHECK(src.is_mps(), "Placeholder storage has not been allocated on MPS device!");
   // extract the pointer to MTLBuffer from the Tensor's storage
   id<MTLBuffer> srcBuf = getMTLBufferStorage(src);
-  // a view tensor could be contiguous (e.g., slice ops) or non-contiguous (e.g., transpose())
-  if (needsGather(src) && gatherTensorData) {
-    Tensor emptyShell = Tensor();
-    // use "_tensor" from Placeholder to retain view's output during its usage in other ops
-    _tensor = gatherViewTensor(src, emptyShell);
-    if (!_tensor.has_storage()) {
-      // if we cannot gather, we make the tensor contiguous implicitly, and keep
-      // it in placeholder to be able to retrieve it when we return from constructor
-      _tensor = src.clone(MemoryFormat::Contiguous);
+
+  const bool is_macOS_15_0_or_newer = is_macos_13_or_newer(MacOSVersion::MACOS_VER_15_0_PLUS);
+  // Use gather kernel to solve strides for macOS < 15.0
+  // Starting with macOS 15.0, MPS supports native strides directly in the kernels
+  if (!is_macOS_15_0_or_newer || !useMPSStridedAPI) {
+    if ((!src.is_contiguous() || src.storage_offset()) && gatherTensorData) {
+      Tensor emptyShell = Tensor();
+      // use "_tensor" from Placeholder to retain view's output during its usage in other ops
+      _tensor = gatherViewTensor(src, emptyShell);
+      if (!_tensor.has_storage()) {
+        // if we cannot gather, we make the tensor contiguous implicitly, and keep
+        // it in placeholder to be able to retrieve it when we return from constructor
+        _tensor = src.clone(MemoryFormat::Contiguous);
+      }
+      srcBuf = getMTLBufferStorage(_tensor);
     }
-    srcBuf = getMTLBufferStorage(_tensor);
   }
 
   // tensor.numel() could be zero, but tensor is valid as long as the buffer size is non-zero.
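getStridedMPSNDArray() sorts dimensions by descending stride, views the buffer with that dense layout, and then permutes the result back into the tensor's dimension order; inversePermutation() supplies the permutation that permuteNDArray() applies. A small self-contained check of that index bookkeeping on an example layout, a contiguous 2x3x4 tensor seen through permute(2, 0, 1) (the program is illustrative and independent of MPS):

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

int main() {
  // A contiguous 2x3x4 tensor permuted with permute(2, 0, 1):
  // the view has sizes {4, 2, 3} and strides {1, 12, 4}.
  std::vector<int64_t> sizes = {4, 2, 3};
  std::vector<int64_t> strides = {1, 12, 4};

  // getSortedStrides(): dimension indices sorted by descending stride.
  std::vector<int64_t> idx(strides.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::sort(idx.begin(), idx.end(), [&](int64_t a, int64_t b) { return strides[a] > strides[b]; });
  assert((idx == std::vector<int64_t>{1, 2, 0}));

  // Reordering sizes/strides by idx recovers the dense base layout
  // {2, 3, 4} / {12, 4, 1}, which is what the sorted arrayView is created with.
  assert(sizes[idx[0]] == 2 && strides[idx[0]] == 12);
  assert(sizes[idx[2]] == 4 && strides[idx[2]] == 1);

  // inversePermutation(): maps the dense view back to the original dimension
  // order; this is the order permuteNDArray() restores.
  std::vector<int64_t> inverse(idx.size());
  for (size_t i = 0; i < idx.size(); i++) {
    inverse[idx[i]] = static_cast<int64_t>(i);
  }
  assert((inverse == std::vector<int64_t>{2, 0, 1}));
  return 0;
}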
@@ -389,9 +526,66 @@ void printTensorNDArray(const Tensor& t) {
     const auto scalar_type = _tensor.scalar_type();
     dataType = _tensor.dim() == 0 ? getMPSScalarType(scalar_type) : getMPSDataType(scalar_type);
   }
-  _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
-                                                    shape:mpsShape ? mpsShape : getMPSShape(_tensor)
-                                                 dataType:dataType] autorelease];
+
+  // Tensor is contiguous and has no storage offset.
+  // Wrap it directly inside MPSGraphTensorData
+  if ((_tensor.is_contiguous() && !_tensor.storage_offset()) || !useMPSStridedAPI || !is_macOS_15_0_or_newer) {
+    _value = [[[MPSGraphTensorData alloc] initWithMTLBuffer:srcBuf
+                                                      shape:mpsShape_ ? mpsShape_ : getMPSShape(_tensor)
+                                                   dataType:dataType] autorelease];
+  } else {
+    IntArrayRef view_shape;
+    if (mpsShape_) {
+      _tensor = getTensorView(src, mpsShape_);
+    }
+
+    MPSShape* mpsShape = getMPSShape(_tensor);
+    MPSShape* mpsStrides = getMPSShape(_tensor.strides());
+
+    IntArrayRef baseShape;
+    if (src.is_view()) {
+      baseShape = src._base().sizes();
+    } else {
+      baseShape = getIMPSAllocator()->getBufferShape(src.storage().data());
+    }
+    int flattenedShaped = 1;
+    for (const auto i : c10::irange(baseShape.size())) {
+      flattenedShaped *= baseShape[i];
+    }
+    MPSShape* mpsBaseShape = @[ @(flattenedShaped) ];
+    MPSNDArrayDescriptor* srcTensorDesc = [MPSNDArrayDescriptor descriptorWithDataType:dataType shape:mpsBaseShape];
+    srcTensorDesc.preferPackedRows = YES;
+    MPSNDArray* srcNDArray = [[[MPSNDArray alloc] initWithBuffer:srcBuf
+                                                          offset:src.storage_offset() * src.element_size()
+                                                      descriptor:srcTensorDesc] autorelease];
+    TORCH_INTERNAL_ASSERT(srcNDArray);
+    if (src.dim() != 0) {
+      srcNDArray = getStridedMPSNDArray(_tensor, srcNDArray);
+    } else {
+      bool needsReshape = false;
+      NSMutableArray* mpsExpandedShape = nil;
+      NSMutableArray* mpsExpandedStrides = nil;
+
+      if (src.dim() > 0 && src.stride(-1) != 1) {
+        needsReshape = true;
+        mpsExpandedShape = [NSMutableArray arrayWithArray:mpsShape];
+        mpsExpandedStrides = [NSMutableArray arrayWithArray:mpsStrides];
+        [mpsExpandedShape addObject:@1];
+        [mpsExpandedStrides addObject:@1];
+      }
+      srcNDArray = [srcNDArray arrayViewWithShape:needsReshape ? mpsExpandedShape : getMPSShape(src)
+                                          strides:needsReshape ? mpsExpandedStrides : getMPSShape(src.strides())];
+      TORCH_INTERNAL_ASSERT(srcNDArray);
+
+      if (needsReshape) {
+        MPSNDArrayIdentity* identity =
+            [[[MPSNDArrayIdentity alloc] initWithDevice:MPSDevice::getInstance()->device()] autorelease];
+        srcNDArray = [identity reshapeWithCommandBuffer:nil sourceArray:srcNDArray shape:mpsShape destinationArray:nil];
+      }
+      TORCH_INTERNAL_ASSERT(srcNDArray);
+    }
+    _value = [[[MPSGraphTensorData alloc] initWithMPSNDArray:srcNDArray] autorelease];
+  }
 
   TORCH_INTERNAL_ASSERT(_value);
   _placeholder = mpsGraphTensor;
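Taken together: on macOS 15 a Placeholder built from a non-contiguous view no longer gathers into a scratch buffer; it wraps the original storage in an MPSNDArray carrying the view's sizes and strides and feeds that to MPSGraphTensorData. Kernels can also reach the same machinery directly through the getMPSNDArray() overloads added above; a minimal sketch (the wrapper function name is illustrative):

// Sketch only: obtain an MPSNDArray that aliases a tensor's storage with its
// view sizes and strides (macOS 15 path), using the overload added above.
static MPSNDArray* ndarrayForView(const at::Tensor& t) {
  // Passing t.strides() requests a strided view of the buffer instead of
  // assuming a packed layout; the defaults would use t.sizes() with no strides.
  return at::native::mps::getMPSNDArray(t, t.sizes(), t.strides());
}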
