
Commit 1bb9b18

Xia-Weiwen authored and pytorchmergebot committed
[CPU][Inductor] Improve A16W4 GEMM template performance by using block_n=32 (#156174)
**Summary**
We found that using `block_n=32` brings better performance for A16W4 than `block_n=64`: cache locality is better, and parallelism is better when N is small and more cores are used. For example, when running Llama-3.1-8B with A16W4 and batch size = 16 on 43 cores, `block_n=32` is faster by >10% E2E for both first and next token.

**Test plan**
```
pytest test/inductor/test_cpu_select_algorithm.py -k test_int4_woq_mm_amx
```

Pull Request resolved: #156174
Approved by: https://github.com/leslie-fang-intel
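Below is a minimal, illustrative sketch (not part of this commit) of the kind of A16W4 GEMM this template serves: a bf16 activation multiplied by an int4-packed weight via `torch.ops.aten._weight_int4pack_mm_for_cpu`, compiled with Inductor max-autotune so the C++ GEMM template can be considered. Shapes, group size, and the config knobs are assumptions for demonstration; whether the AMX WOQ int4 micro-kernel is actually selected depends on the CPU and on settings covered by the test above.

```python
import torch
import torch._inductor.config as inductor_config

# Illustrative shapes; the PR's numbers come from Llama-3.1-8B layers.
M, N, K, group_size = 16, 4096, 4096, 128

x = torch.randn(M, K, dtype=torch.bfloat16)
# int4 weight packed two values per byte, viewed as [N, K // 2] uint8
w_packed = torch.randint(0, 256, (N, K // 2), dtype=torch.uint8)
# per-group scales and zero points: [K // group_size, N, 2], bf16
scales_and_zeros = torch.rand(K // group_size, N, 2, dtype=torch.bfloat16)

def a16w4_mm(a):
    return torch.ops.aten._weight_int4pack_mm_for_cpu(
        a, w_packed, group_size, scales_and_zeros
    )

# Let Inductor autotune GEMMs so the C++ WOQ int4 template (block_n=32 on
# AMX-capable CPUs) can compete with the ATen kernel.
inductor_config.max_autotune = True
inductor_config.max_autotune_gemm_backends = "CPP,ATEN"

out = torch.compile(a16w4_mm)(x)
print(out.shape)  # torch.Size([16, 4096])
```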
1 parent d99cac2 commit 1bb9b18

2 files changed: +122 -76


torch/_inductor/codegen/cpp_gemm_template.py

Lines changed: 67 additions & 3 deletions
```diff
@@ -234,7 +234,7 @@
     {%- set tile_X = kernel.slice_nd(X, [("m_start", "m_end"), ("k_start", "k_end")]) %}
     for (int64_t nci = nc; nci < nc_block_end; nci++) {
     {%- set acc_slice = kernel.slice_nd(acc, [("0", "m_end - m_start"), ("(nci - nc)*Nr", "(nci - nc + 1)*Nr")]) %}
-    {%- if template.should_block_weights %}
+    {%- if template.should_block_weights and not is_woq_int4 %}
     {%- set tile_W_3d = kernel.slice_nd(W, [("nci", "nci + 1"), ("k_start", "k_end"), ()]) %}
     {%- set tile_W = kernel.view(tile_W_3d, ["k_end - k_start", micro_gemm.register_blocking.block_n]) %}
     {%- else %}
@@ -1125,9 +1125,12 @@ def prep_weight(
         new_size, padded_n = cls.get_padded_size(n, block_n, k, should_block_weight)
         padding = padded_n - n

-        if should_block_weight:
+        if should_block_weight and not cls.is_woq_int4():
             blocked_w = cls.block_weight(W, new_size, padding)
             new_inputs[1] = cls.pack_vnni_weight(blocked_w, micro_gemm, new_size)
+        elif should_block_weight:
+            assert cls.is_woq_int4()
+            new_inputs[1] = cls.block_weight(W, new_size, padding)
         elif isinstance(W, ir.IRNode):
             # Require W layout to be fixed & contiguous, happens inplace.
             ir.ExternKernel.require_contiguous(W)
@@ -1689,7 +1692,68 @@ def q_group_size(cls):
         @staticmethod
         def check_if_block_weight(W, micro_gemm):
             # For WOQ INT4, weight is already packed
-            return False
+            # However, for AMX microkernel, we want to change the blocking of weight
+            from .cpp_micro_gemm import CppMicroGemmWoQInt4Amx
+
+            return isinstance(micro_gemm, CppMicroGemmWoQInt4Amx)
+
+        @classmethod
+        def block_weight(cls, W, new_size, padding):
+            # This method is called only if AMX microkernels are used.
+            # In this case, we unpack and repack weight so that block_n=32
+            # the format of packed weight is described here:
+            # https://github.com/pytorch/pytorch/blob/32eee8ed225d9f10fbbcb38c24b8b44c24c0c97c/aten/src/ATen/native/cpu/int4mm_kernel.cpp#L583
+            if isinstance(W, ir.IRNode):
+                # in this case, we do nothing
+                ir.ExternKernel.require_contiguous(W)
+                blocked_w = W
+            else:
+                # in this case, we unpack and repack weight
+                assert isinstance(W, torch.Tensor)
+                assert W.dim() == 2
+                N = W.size(0)
+                K = W.size(-1) * 2
+                G = cls.q_group_size()
+                # x and qscales_and_zeros are in bfloat16 instead of float to use the optimized kernel
+                # so that the unpacking process is faster
+                x = torch.eye(K).bfloat16()
+                # Here we use scale=1 and qzero=8 because we want to unpack weight
+                # without dequantizing it. The qzero here is 8 instead of 0 because
+                # int4 values are converted to [-7, 8] in the _weight_int4pack_mm_for_cpu kernel:
+                # https://github.com/pytorch/pytorch/blob/32eee8ed225d9f10fbbcb38c24b8b44c24c0c97c/aten/src/ATen/native/cpu/int4mm_kernel.cpp#L95
+                qscales_and_zeros = (
+                    torch.tensor([1.0, 8.0])
+                    .bfloat16()
+                    .expand(K // G, N, 2)
+                    .contiguous()
+                )
+                # shape: [K, N]
+                unpacked_w = torch.ops.aten._weight_int4pack_mm_for_cpu(
+                    x,
+                    W,
+                    G,
+                    qscales_and_zeros,
+                ).to(torch.uint8)
+                block_n = 32
+                # shape: [N // block_n, K, block_n]
+                w_blocked = (
+                    unpacked_w.view(K, N // block_n, block_n)
+                    .permute(1, 0, 2)
+                    .contiguous()
+                )
+                # pack 2 int4 -> 1 int8
+                # block_n: [a0, a1, ..., a15, b0, b1, ..., b15]
+                # -> [(a0 & 0xf) | (b0 << 4), (a1 & 0xf) | (b1 << 4), ...]
+                # shape: [N // block_n, K, 2, block_n // 2]
+                w_blocked = w_blocked.view(N // block_n, K, 2, block_n // 2)
+                # shape: [N // block_n, K, block_n // 2]
+                w_blocked_packed = (w_blocked[:, :, 0, :] & 0xF) | (
+                    w_blocked[:, :, 1, :] << 4
+                )
+                # shape: [N, K // 2]
+                blocked_w = w_blocked_packed.view(N, K // 2)
+
+            return blocked_w

     return CppWoqInt4GemmTemplateInstance
```

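The nibble layout produced by the new `block_weight` above (within each `block_n=32` group, byte `j` of a row holds column `j` in the low nibble and column `j + 16` in the high nibble) can be sanity-checked with a short, self-contained sketch; the tensor names and shapes here are illustrative and not part of the commit:

```python
import torch

K, N, block_n = 4, 64, 32
# Pretend these are already-unpacked int4 codes (0..15), blocked as [N // block_n, K, block_n].
w = torch.randint(0, 16, (N // block_n, K, block_n), dtype=torch.uint8)

# Pack as block_weight does: split each 32-wide block into two halves of 16 and
# store them as the low/high nibbles of one byte.
w4 = w.view(N // block_n, K, 2, block_n // 2)
packed = (w4[:, :, 0, :] & 0xF) | (w4[:, :, 1, :] << 4)  # [N // block_n, K, 16]
packed_2d = packed.view(N, K // 2)                       # the [N, K // 2] view handed to the kernel

# Unpack the way the micro-kernel reads it: low nibble first, then shift right by 4.
lo = packed & 0xF   # columns 0..15 of each block
hi = packed >> 4    # columns 16..31 of each block
assert torch.equal(torch.cat([lo, hi], dim=-1), w)
```
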
torch/_inductor/codegen/cpp_micro_gemm.py

Lines changed: 55 additions & 73 deletions
```diff
@@ -1231,10 +1231,6 @@ def codegen_define(self, kernel: CppTemplateKernel) -> str:
         else:
             assert block_k == 32, "Only support block_k = 32 for AMX Bfloat16/Float16"
         num_columns = block_n // 16
-        if self.is_woq_int4():
-            # block_n for woq int4 is 64, which is too large for micro kernel
-            # so we split it into 2x32. Here num_columns = 2.
-            num_columns //= 2
         options = {
             "declare_kernel": self.get_kernel_declaration(),
             "use_cached_dequantized_B": (
@@ -1633,8 +1629,8 @@ def is_woq_int4(self):
     *generate_gemm_config(
         VecAMX,
         [  # (block_m, block_n, block_k)
-            (16, 64, 32),
-            (32, 64, 32),
+            (16, 32, 32),
+            (32, 32, 32),
         ],
         input_dtype=torch.bfloat16,
         input2_dtype=torch.uint8,
@@ -1646,8 +1642,8 @@ def is_woq_int4(self):
 class CppMicroGemmWoQInt4Amx(CppMicroGemmAMX):
     """
     This class generates the code for WoQ int4 micro gemm using AMX intrinsics,
-    which are available on 4th and 5th generation Intel Xeon.
-    Shape of packed weight = [N // 64, K, 32], viewed as [N, K // 2]
+    which are available on 4th and newer generations of Intel Xeon.
+    Shape of packed weight = [N // 32, K, 16], viewed as [N, K // 2]
     Shape of packed ScalesAndZeros = [K // group_size, N, 2]
     Reuse TEMPLATE_KERNEL of CppMicroGemmAMX.
     """
@@ -1660,7 +1656,7 @@ class CppMicroGemmWoQInt4Amx(CppMicroGemmAMX):
 {{declare_kernel}} {
   {{kernel.assert_function}}(N % {{block_n}} == 0, "N dimension must be multiple of {{block_n}}");
   {{kernel.assert_function}}(K % 2 == 0, "K dimension must be multiple of 2");
-  {{kernel.assert_function}}({{block_n}} == 64, "block_n must be 64 for WOQ int4");
+  {{kernel.assert_function}}({{block_n}} == 32, "block_n must be 32 for WOQ int4");

   // Create a stack-allocated buffer for tiles of B.
   // Except maybe for the tail-case, an AMX tile of B has 16x32 BF16 elements.
@@ -1674,6 +1670,7 @@ class CppMicroGemmWoQInt4Amx(CppMicroGemmAMX):
   const int PREFETCH_SIZE_KB = (PREFETCH_SIZE_K + BLOCK_K - 1) / BLOCK_K;
   const int KB = K / BLOCK_K;

+  __m512i b32[COLS * 2];
   __m512 vb[COLS * 2];
   __m512 scale[COLS];
   __m512 zero[COLS];
@@ -1759,7 +1756,7 @@
   // Dequantize a B block of 2 * block_n into bf16
   // So, it handles k and k+1 at the same time
   auto dequantize_B = [&](int n) {
-    constexpr int64_t ldb_int4 = BLOCK_N / 2; // 32
+    constexpr int64_t ldb_int4 = BLOCK_N / 2; // 16
     for (int k = 0, kb = 0; k < K; k += 2) {
       // Since block_k must be 32 for AMX microkernels, k_start may not be
       // a multiple of q_group_size. In that case, we need to load scales
@@ -1769,35 +1766,25 @@
       }

       // load 256 bits = 64 elements in int4
-      __m256i b4 = _mm256_loadu_si256((__m256i*)(B + n * K + k * ldb_int4));
       if (k + PREFETCH_SIZE_K < K) {
         _mm_prefetch(B + (k + PREFETCH_SIZE_K) * ldb_int4, _MM_HINT_T0);
       }

-      __m512i b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4));
-      vb[0] = _mm512_permutexvar_ps(b32, lut);
+      __m128i b4 = _mm_loadu_si128((__m128i*)(B + n / 2 * K + k * ldb_int4));
+      b32[0] = _mm512_cvtepu8_epi32(b4);
+      b32[1] = _mm512_srli_epi32(b32[0], 4);
+      vb[0] = _mm512_permutexvar_ps(b32[0], lut);
       vb[0] = _mm512_fmadd_ps(vb[0], scale[0], zero[0]);
-      vb[2] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
-      vb[2] = _mm512_fmadd_ps(vb[2], scale[2], zero[2]);
-
-      b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1));
-      vb[1] = _mm512_permutexvar_ps(b32, lut);
+      vb[1] = _mm512_permutexvar_ps(b32[1], lut);
       vb[1] = _mm512_fmadd_ps(vb[1], scale[1], zero[1]);
-      vb[3] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
-      vb[3] = _mm512_fmadd_ps(vb[3], scale[3], zero[3]);

-      b4 = _mm256_loadu_si256((__m256i*)(B + n * K + (k + 1) * ldb_int4));
-      b32 = _mm512_cvtepu8_epi32(_mm256_castsi256_si128(b4));
-      vb[0 + COLS] = _mm512_permutexvar_ps(b32, lut);
+      b4 = _mm_loadu_si128((__m128i*)(B + n / 2 * K + (k + 1) * ldb_int4));
+      b32[0 + COLS] = _mm512_cvtepu8_epi32(b4);
+      b32[1 + COLS] = _mm512_srli_epi32(b32[0 + COLS], 4);
+      vb[0 + COLS] = _mm512_permutexvar_ps(b32[0 + COLS], lut);
       vb[0 + COLS] = _mm512_fmadd_ps(vb[0 + COLS], scale[0], zero[0]);
-      vb[2 + COLS] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
-      vb[2 + COLS] = _mm512_fmadd_ps(vb[2 + COLS], scale[2], zero[2]);
-
-      b32 = _mm512_cvtepu8_epi32(_mm256_extracti128_si256(b4, 1));
-      vb[1 + COLS] = _mm512_permutexvar_ps(b32, lut);
+      vb[1 + COLS] = _mm512_permutexvar_ps(b32[1 + COLS], lut);
       vb[1 + COLS] = _mm512_fmadd_ps(vb[1 + COLS], scale[1], zero[1]);
-      vb[3 + COLS] = _mm512_permutexvar_ps(_mm512_srli_epi32(b32, 4), lut);
-      vb[3 + COLS] = _mm512_fmadd_ps(vb[3 + COLS], scale[3], zero[3]);

       for (int i = 0; i < COLS; i++) {
         // convert to VNNI
@@ -1811,57 +1798,52 @@
         auto v = _mm512_castsi256_si512(v0_bf16);
         v = _mm512_inserti64x4(v, v1_bf16, 1);
         // store the VNNI format bfloat16 values
-        // split block_n into 2x32
-        {{input_t}}* addr = dequantized_B_buf + K * 32 * (i / 2) + k * 32 + (i % 2) * 32;
+        {{input_t}}* addr = dequantized_B_buf + k * 32 + (i % 2) * 32;
         _mm512_storeu_si512(addr, v);
       }
     }
   };

-  const int64_t updated_ldb = {{block_n}} / 2;
   for (int64_t n = 0; n < N; n += {{block_n}}) {
     // Dequantize K * block_n int8 B elements into BF16
     dequantize_B(n);
-    // for woq int4, block_n is 64, which is too large for micro kernel
-    for (int64_t ni = 0; ni < {{block_n}}; ni += 32) {
-      for (int64_t m = 0; m < M; m += {{block_m}}) {
-        int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
-        int64_t m_tail = m;
-        {%- for num_rows in range(block_m, 0, -16) %}
-        {%- if num_rows != block_m %}
-        else
-        {%- endif %}
-        if (block_m >= {{num_rows}}) {
-          {{kernel_name}}_amx_kernel_{{num_rows}}_{{num_columns}}<accum>(
-            amx_state,
-            A + m * lda,
-            dequantized_B_buf + ni * K,
-            C + m * ldc + n + ni,
-            K,
-            lda,
-            updated_ldb,
-            ldc,
-            16
-          );
-          block_m -= {{num_rows}};
-          m_tail += {{num_rows}};
-        }
-        {%- endfor %}
-        if (block_m > 0) {
-          {{kernel_name}}_amx_kernel_16_{{num_columns}}<accum>(
-            amx_state,
-            A + m_tail * lda,
-            dequantized_B_buf + ni * K,
-            C + m_tail * ldc + n + ni,
-            K,
-            lda,
-            updated_ldb,
-            ldc,
-            block_m
-          );
-        }
-      } // for m
-    } // for ni
+    for (int64_t m = 0; m < M; m += {{block_m}}) {
+      int64_t block_m = std::min<int64_t>(M - m, {{block_m}});
+      int64_t m_tail = m;
+      {%- for num_rows in range(block_m, 0, -16) %}
+      {%- if num_rows != block_m %}
+      else
+      {%- endif %}
+      if (block_m >= {{num_rows}}) {
+        {{kernel_name}}_amx_kernel_{{num_rows}}_{{num_columns}}<accum>(
+          amx_state,
+          A + m * lda,
+          dequantized_B_buf + n * K,
+          C + m * ldc + n,
+          K,
+          lda,
+          {{block_n}},
+          ldc,
+          16
+        );
+        block_m -= {{num_rows}};
+        m_tail += {{num_rows}};
+      }
+      {%- endfor %}
+      if (block_m > 0) {
+        {{kernel_name}}_amx_kernel_16_{{num_columns}}<accum>(
+          amx_state,
+          A + m_tail * lda,
+          dequantized_B_buf + n * K,
+          C + m_tail * ldc + n,
+          K,
+          lda,
+          {{block_n}},
+          ldc,
+          block_m
+        );
+      }
+    } // for m
   } // for n
 }
 """
```
