From c40e64768482d7df014974595692b16def41827f Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Sun, 14 Jul 2024 16:55:30 -0700 Subject: [PATCH 1/6] Replace swizzles with shuffles, remove unnecessary math complexity --- libs/zmath/src/zmath.zig | 92 +++++++++++++++++++++++++++------------- 1 file changed, 63 insertions(+), 29 deletions(-) diff --git a/libs/zmath/src/zmath.zig b/libs/zmath/src/zmath.zig index 34c57d221..e22550d38 100644 --- a/libs/zmath/src/zmath.zig +++ b/libs/zmath/src/zmath.zig @@ -340,10 +340,9 @@ pub inline fn splatInt(comptime T: type, value: u32) T { } pub fn load(mem: []const f32, comptime T: type, comptime len: u32) T { - var v = splat(T, 0.0); + var v: T = @splat(0); const loop_len = if (len == 0) veclen(T) else len; - comptime var i: u32 = 0; - inline while (i < loop_len) : (i += 1) { + inline for (0..loop_len) |i| { v[i] = mem[i]; } return v; @@ -474,12 +473,28 @@ pub fn all(vb: anytype, comptime len: u32) bool { if (len > veclen(T)) { @compileError("zmath.all(): 'len' is greater than vector len of type " ++ @typeName(T)); } - const loop_len = if (len == 0) veclen(T) else len; - const ab: [veclen(T)]bool = vb; + const lenOrVecLen = comptime if (len == 0) veclen(T) else len; + // Handle int and bool types that can use @reduce. + const childType = @typeInfo(T).Vector.child; + if (childType == bool or childType == std.builtin.Type.Int) { + if (lenOrVecLen == veclen(T)) { + return @reduce(.And, vb); + } + const resizeMask = comptime blk: { + var mask: [len]i32 = undefined; + for (0..len) |i| { + mask[i] = i; + } + break :blk mask; + }; + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.And, resized); + } + // Float vectors don't support '.And', so manually loop. comptime var i: u32 = 0; var result = true; - inline while (i < loop_len) : (i += 1) { - result = result and ab[i]; + inline while (i < lenOrVecLen) : (i += 1) { + result = result and (vb[i] != 0); } return result; } @@ -488,12 +503,19 @@ test "zmath.all" { try expect(all(boolx8(true, true, true, true, true, false, true, false), 6) == false); try expect(all(boolx8(true, true, true, true, false, false, false, false), 4) == true); try expect(all(boolx4(true, true, true, false), 3) == true); + try expect(all(boolx4(true, true, true, true), 4) == true); + try expect(all(boolx4(true, true, false, true), 4) == false); try expect(all(boolx4(true, true, true, false), 1) == true); try expect(all(boolx4(true, false, false, false), 1) == true); try expect(all(boolx4(false, true, false, false), 1) == false); try expect(all(boolx8(true, true, true, true, true, false, true, false), 0) == false); try expect(all(boolx4(false, true, false, false), 0) == false); try expect(all(boolx4(true, true, true, true), 0) == true); + try expect(all(f32x4(1, 1, 1, 1), 0) == true); + try expect(all(f32x4(0, 0, 1, 0), 0) == false); + try expect(all(f32x4(0, 0, 0, 0), 0) == false); + try expect(all(f32x4(0, 0, 0, 1), 1) == false); + try expect(all(f32x4(1, 0, 0, 0), 1) == true); } pub fn any(vb: anytype, comptime len: u32) bool { @@ -501,12 +523,26 @@ pub fn any(vb: anytype, comptime len: u32) bool { if (len > veclen(T)) { @compileError("zmath.any(): 'len' is greater than vector len of type " ++ @typeName(T)); } - const loop_len = if (len == 0) veclen(T) else len; - const ab: [veclen(T)]bool = vb; + const lenOrVecLen = comptime if (len == 0) veclen(T) else len; + const childType = @typeInfo(T).Vector.child; + if (childType == bool or childType == std.builtin.Type.Int) { + if (lenOrVecLen == veclen(T)) { + return @reduce(.Or, vb); + } + const resizeMask = comptime blk: { + var mask: [len]i32 = undefined; + for (0..len) |i| { + mask[i] = i; + } + break :blk mask; + }; + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.Or, resized); + } comptime var i: u32 = 0; var result = false; - inline while (i < loop_len) : (i += 1) { - result = result or ab[i]; + inline while (i < lenOrVecLen) : (i += 1) { + result = result or (vb[i] != 0); } return result; } @@ -514,6 +550,10 @@ test "zmath.any" { try expect(any(boolx8(true, true, true, true, true, false, true, false), 0) == true); try expect(any(boolx8(false, false, false, true, true, false, true, false), 3) == false); try expect(any(boolx8(false, false, false, false, false, true, false, false), 4) == false); + try expect(any(f32x4(1, 1, 1, 1), 0) == true); + try expect(any(f32x4(0, 0, 0, 0), 0) == false); + try expect(any(f32x4(1, 0, 0, 1), 1) == true); + try expect(any(f32x4(0, 0, 0, 1), 1) == false); } pub inline fn isNearEqual( @@ -1911,10 +1951,8 @@ test "zmath.atan2" { // // ------------------------------------------------------------------------------ pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { - var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | - const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | - xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | - return swizzle(xmm0, .x, .x, .x, .x); + const xmm0 = v0 * v1; + return @splat(xmm0[0] + xmm0[1]); } test "zmath.dot2" { const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); @@ -1935,12 +1973,8 @@ test "zmath.dot3" { } pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { - var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | - var xmm1 = swizzle(xmm0, .y, .x, .w, .x); // | y0*y1 | -- | w0*w1 | -- | - xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | -- | z0*z1 + w0*w1 | -- | - xmm0 = swizzle(xmm1, .z, .x, .x, .x); // | z0*z1 + w0*w1 | -- | -- | -- | - xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[2]); // addss - return swizzle(xmm0, .x, .x, .x, .x); + const xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | + return @splat(xmm0[0] + xmm0[1] + xmm0[2] + xmm0[3]); } test "zmath.dot4" { const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); @@ -1950,11 +1984,11 @@ test "zmath.dot4" { } pub inline fn cross3(v0: Vec, v1: Vec) Vec { - var xmm0 = swizzle(v0, .y, .z, .x, .w); - var xmm1 = swizzle(v1, .z, .x, .y, .w); + var xmm0 = @shuffle(f32, v0, undefined, [4]i32{ 1, 2, 0, 2 }); + var xmm1 = @shuffle(f32, v1, undefined, [4]i32{ 2, 0, 1, 3 }); var result = xmm0 * xmm1; - xmm0 = swizzle(xmm0, .y, .z, .x, .w); - xmm1 = swizzle(xmm1, .z, .x, .y, .w); + xmm0 = @shuffle(f32, xmm0, undefined, [4]i32{ 1, 2, 0, 3 }); + xmm1 = @shuffle(f32, xmm1, undefined, [4]i32{ 2, 0, 1, 3 }); result = result - xmm0 * xmm1; return andInt(result, f32x4_mask3); } @@ -2153,10 +2187,10 @@ fn mulMat(m0: Mat, m1: Mat) Mat { var result: Mat = undefined; comptime var row: u32 = 0; inline while (row < 4) : (row += 1) { - const vx = swizzle(m0[row], .x, .x, .x, .x); - const vy = swizzle(m0[row], .y, .y, .y, .y); - const vz = swizzle(m0[row], .z, .z, .z, .z); - const vw = swizzle(m0[row], .w, .w, .w, .w); + const vx = @shuffle(f32, m0[row], undefined, [4]i32{ 0, 0, 0, 0 }); + const vy = @shuffle(f32, m0[row], undefined, [4]i32{ 1, 1, 1, 1 }); + const vz = @shuffle(f32, m0[row], undefined, [4]i32{ 2, 2, 2, 2 }); + const vw = @shuffle(f32, m0[row], undefined, [4]i32{ 3, 3, 3, 3 }); result[row] = mulAdd(vx, m1[0], vz * m1[2]) + mulAdd(vy, m1[1], vw * m1[3]); } return result; From c2c705a21cc69a3f57012075d4e4f85cc207b8f8 Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Sun, 14 Jul 2024 17:19:52 -0700 Subject: [PATCH 2/6] Upgrade all swizzles --- libs/zmath/src/zmath.zig | 172 +++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 86 deletions(-) diff --git a/libs/zmath/src/zmath.zig b/libs/zmath/src/zmath.zig index e22550d38..42b453cfd 100644 --- a/libs/zmath/src/zmath.zig +++ b/libs/zmath/src/zmath.zig @@ -2509,31 +2509,31 @@ pub fn orthographicOffCenterRhGl(left: f32, right: f32, top: f32, bottom: f32, n } pub fn determinant(m: Mat) F32x4 { - var v0 = swizzle(m[2], .y, .x, .x, .x); - var v1 = swizzle(m[3], .z, .z, .y, .y); - var v2 = swizzle(m[2], .y, .x, .x, .x); - var v3 = swizzle(m[3], .w, .w, .w, .z); - var v4 = swizzle(m[2], .z, .z, .y, .y); - var v5 = swizzle(m[3], .w, .w, .w, .z); + var v0 = @shuffle(f32, m[2], undefined, [4]i32{ 1, 0, 0, 0 }); + var v1 = @shuffle(f32, m[3], undefined, [4]i32{ 2, 2, 1, 1 }); + var v2 = @shuffle(f32, m[2], undefined, [4]i32{ 1, 0, 0, 0 }); + var v3 = @shuffle(f32, m[3], undefined, [4]i32{ 3, 3, 3, 2 }); + var v4 = @shuffle(f32, m[2], undefined, [4]i32{ 2, 2, 1, 1 }); + var v5 = @shuffle(f32, m[3], undefined, [4]i32{ 3, 3, 3, 2 }); var p0 = v0 * v1; var p1 = v2 * v3; var p2 = v4 * v5; - v0 = swizzle(m[2], .z, .z, .y, .y); - v1 = swizzle(m[3], .y, .x, .x, .x); - v2 = swizzle(m[2], .w, .w, .w, .z); - v3 = swizzle(m[3], .y, .x, .x, .x); - v4 = swizzle(m[2], .w, .w, .w, .z); - v5 = swizzle(m[3], .z, .z, .y, .y); + v0 = @shuffle(f32, m[2], undefined, [4]i32{ 2, 2, 1, 1 }); + v1 = @shuffle(f32, m[3], undefined, [4]i32{ 1, 0, 0, 0 }); + v2 = @shuffle(f32, m[2], undefined, [4]i32{ 3, 3, 3, 2 }); + v3 = @shuffle(f32, m[3], undefined, [4]i32{ 1, 0, 0, 0 }); + v4 = @shuffle(f32, m[2], undefined, [4]i32{ 3, 3, 3, 2 }); + v5 = @shuffle(f32, m[3], undefined, [4]i32{ 2, 2, 1, 1 }); p0 = mulAdd(-v0, v1, p0); p1 = mulAdd(-v2, v3, p1); p2 = mulAdd(-v4, v5, p2); - v0 = swizzle(m[1], .w, .w, .w, .z); - v1 = swizzle(m[1], .z, .z, .y, .y); - v2 = swizzle(m[1], .y, .x, .x, .x); + v0 = @shuffle(f32, m[1], undefined, [4]i32{ 3, 3, 3, 2 }); + v1 = @shuffle(f32, m[1], undefined, [4]i32{ 2, 2, 1, 1 }); + v2 = @shuffle(f32, m[1], undefined, [4]i32{ 1, 0, 0, 0 }); const s = m[0] * f32x4(1.0, -1.0, 1.0, -1.0); var r = v0 * p0; @@ -2569,10 +2569,10 @@ pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { var v0: [4]F32x4 = undefined; var v1: [4]F32x4 = undefined; - v0[0] = swizzle(mt[2], .x, .x, .y, .y); - v1[0] = swizzle(mt[3], .z, .w, .z, .w); - v0[1] = swizzle(mt[0], .x, .x, .y, .y); - v1[1] = swizzle(mt[1], .z, .w, .z, .w); + v0[0] = @shuffle(f32, mt[2], undefined, [4]i32{ 0, 0, 1, 1 }); + v1[0] = @shuffle(f32, mt[3], undefined, [4]i32{ 2, 3, 2, 3 }); + v0[1] = @shuffle(f32, mt[0], undefined, [4]i32{ 0, 0, 1, 1 }); + v1[1] = @shuffle(f32, mt[1], undefined, [4]i32{ 2, 3, 2, 3 }); v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); @@ -2580,10 +2580,10 @@ pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { var d1 = v0[1] * v1[1]; var d2 = v0[2] * v1[2]; - v0[0] = swizzle(mt[2], .z, .w, .z, .w); - v1[0] = swizzle(mt[3], .x, .x, .y, .y); - v0[1] = swizzle(mt[0], .z, .w, .z, .w); - v1[1] = swizzle(mt[1], .x, .x, .y, .y); + v0[0] = @shuffle(f32, mt[2], undefined, [4]i32{ 2, 3, 2, 3 }); + v1[0] = @shuffle(f32, mt[3], undefined, [4]i32{ 0, 0, 1, 1 }); + v0[1] = @shuffle(f32, mt[0], undefined, [4]i32{ 2, 3, 2, 3 }); + v1[1] = @shuffle(f32, mt[1], undefined, [4]i32{ 0, 0, 1, 1 }); v0[2] = @shuffle(f32, mt[2], mt[0], [4]i32{ 1, 3, ~@as(i32, 1), ~@as(i32, 3) }); v1[2] = @shuffle(f32, mt[3], mt[1], [4]i32{ 0, 2, ~@as(i32, 0), ~@as(i32, 2) }); @@ -2591,13 +2591,13 @@ pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { d1 = mulAdd(-v0[1], v1[1], d1); d2 = mulAdd(-v0[2], v1[2], d2); - v0[0] = swizzle(mt[1], .y, .z, .x, .y); + v0[0] = @shuffle(f32, mt[1], undefined, [4]i32{ 1, 2, 0, 1 }); v1[0] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 1, 3, 0 }); - v0[1] = swizzle(mt[0], .z, .x, .y, .x); + v0[1] = @shuffle(f32, mt[0], undefined, [4]i32{ 2, 0, 1, 0 }); v1[1] = @shuffle(f32, d0, d2, [4]i32{ 3, ~@as(i32, 1), 1, 2 }); - v0[2] = swizzle(mt[3], .y, .z, .x, .y); + v0[2] = @shuffle(f32, mt[3], undefined, [4]i32{ 1, 2, 0, 1 }); v1[2] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 1, 3, 0 }); - v0[3] = swizzle(mt[2], .z, .x, .y, .x); + v0[3] = @shuffle(f32, mt[2], undefined, [4]i32{ 2, 0, 1, 0 }); v1[3] = @shuffle(f32, d1, d2, [4]i32{ 3, ~@as(i32, 3), 1, 2 }); var c0 = v0[0] * v1[0]; @@ -2605,13 +2605,13 @@ pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { var c4 = v0[2] * v1[2]; var c6 = v0[3] * v1[3]; - v0[0] = swizzle(mt[1], .z, .w, .y, .z); + v0[0] = @shuffle(f32, mt[1], undefined, [4]i32{ 2, 3, 1, 2 }); v1[0] = @shuffle(f32, d0, d2, [4]i32{ 3, 0, 1, ~@as(i32, 0) }); - v0[1] = swizzle(mt[0], .w, .z, .w, .y); + v0[1] = @shuffle(f32, mt[0], undefined, [4]i32{ 3, 2, 3, 1 }); v1[1] = @shuffle(f32, d0, d2, [4]i32{ 2, 1, ~@as(i32, 0), 0 }); - v0[2] = swizzle(mt[3], .z, .w, .y, .z); + v0[2] = @shuffle(f32, mt[3], undefined, [4]i32{ 2, 3, 1, 2 }); v1[2] = @shuffle(f32, d1, d2, [4]i32{ 3, 0, 1, ~@as(i32, 2) }); - v0[3] = swizzle(mt[2], .w, .z, .w, .y); + v0[3] = @shuffle(f32, mt[2], undefined, [4]i32{ 3, 2, 3, 1 }); v1[3] = @shuffle(f32, d1, d2, [4]i32{ 2, 1, ~@as(i32, 2), 0 }); c0 = mulAdd(-v0[0], v1[0], c0); @@ -2619,13 +2619,13 @@ pub fn inverseDet(m: Mat, out_det: ?*F32x4) Mat { c4 = mulAdd(-v0[2], v1[2], c4); c6 = mulAdd(-v0[3], v1[3], c6); - v0[0] = swizzle(mt[1], .w, .x, .w, .x); + v0[0] = @shuffle(f32, mt[1], undefined, [4]i32{ 3, 0, 3, 0 }); v1[0] = @shuffle(f32, d0, d2, [4]i32{ 2, ~@as(i32, 1), ~@as(i32, 0), 2 }); - v0[1] = swizzle(mt[0], .y, .w, .x, .z); + v0[1] = @shuffle(f32, mt[0], undefined, [4]i32{ 1, 3, 0, 2 }); v1[1] = @shuffle(f32, d0, d2, [4]i32{ ~@as(i32, 1), 0, 3, ~@as(i32, 0) }); - v0[2] = swizzle(mt[3], .w, .x, .w, .x); + v0[2] = @shuffle(f32, mt[3], undefined, [4]i32{ 3, 0, 3, 0 }); v1[2] = @shuffle(f32, d1, d2, [4]i32{ 2, ~@as(i32, 3), ~@as(i32, 2), 2 }); - v0[3] = swizzle(mt[2], .y, .w, .x, .z); + v0[3] = @shuffle(f32, mt[2], undefined, [4]i32{ 1, 3, 0, 2 }); v1[3] = @shuffle(f32, d1, d2, [4]i32{ ~@as(i32, 3), 0, 3, ~@as(i32, 2) }); const c1 = mulAdd(-v0[0], v1[0], c0); @@ -2690,8 +2690,8 @@ pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { const c1 = splat(F32x4, sincos_angle[1]); const c0 = splat(F32x4, sincos_angle[0]); - const n0 = swizzle(axis, .y, .z, .x, .w); - const n1 = swizzle(axis, .z, .x, .y, .w); + const n0 = @shuffle(f32, axis, undefined, [4]i32{ 1, 2, 0, 3 }); + const n1 = @shuffle(f32, axis, undefined, [4]i32{ 2, 0, 1, 3 }); var v0 = c2 * n0 * n1; const r0 = c2 * axis * axis + c1; @@ -2701,19 +2701,19 @@ pub fn matFromNormAxisAngle(axis: Vec, angle: f32) Mat { v0 = andInt(r0, f32x4_mask3); var v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 2, ~@as(i32, 1), ~@as(i32, 2) }); - v1 = swizzle(v1, .y, .z, .w, .x); + v1 = @shuffle(f32, v1, undefined, [4]i32{ 1, 2, 3, 0 }); var v2 = @shuffle(f32, r1, r2, [4]i32{ 1, 1, ~@as(i32, 0), ~@as(i32, 0) }); - v2 = swizzle(v2, .x, .z, .x, .z); + v2 = @shuffle(f32, v2, undefined, [4]i32{ 0, 2, 0, 2 }); r2 = @shuffle(f32, v0, v1, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); - r2 = swizzle(r2, .x, .z, .w, .y); + r2 = @shuffle(f32, r2, undefined, [4]i32{ 0, 2, 3, 1 }); var m: Mat = undefined; m[0] = r2; r2 = @shuffle(f32, v0, v1, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); - r2 = swizzle(r2, .z, .x, .w, .y); + r2 = @shuffle(f32, r2, undefined, [4]i32{ 2, 0, 3, 1 }); m[1] = r2; v2 = @shuffle(f32, v2, v0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); @@ -2758,38 +2758,38 @@ pub fn matFromQuat(quat: Quat) Mat { const q0 = quat + quat; var q1 = quat * q0; - var v0 = swizzle(q1, .y, .x, .x, .w); + var v0 = @shuffle(f32, q1, undefined, [4]i32{ 1, 0, 0, 3 }); v0 = andInt(v0, f32x4_mask3); - var v1 = swizzle(q1, .z, .z, .y, .w); + var v1 = @shuffle(f32, q1, undefined, [4]i32{ 2, 2, 1, 3 }); v1 = andInt(v1, f32x4_mask3); const r0 = (f32x4(1.0, 1.0, 1.0, 0.0) - v0) - v1; - v0 = swizzle(quat, .x, .x, .y, .w); - v1 = swizzle(q0, .z, .y, .z, .w); + v0 = @shuffle(f32, quat, undefined, [4]i32{ 0, 0, 1, 3 }); + v1 = @shuffle(f32, q0, undefined, [4]i32{ 2, 1, 2, 3 }); v0 = v0 * v1; - v1 = swizzle(quat, .w, .w, .w, .w); - const v2 = swizzle(q0, .y, .z, .x, .w); + v1 = @shuffle(f32, quat, undefined, [4]i32{ 3, 3, 3, 3 }); + const v2 = @shuffle(f32, q0, undefined, [4]i32{ 1, 2, 0, 3 }); v1 = v1 * v2; const r1 = v0 + v1; const r2 = v0 - v1; v0 = @shuffle(f32, r1, r2, [4]i32{ 1, 2, ~@as(i32, 0), ~@as(i32, 1) }); - v0 = swizzle(v0, .x, .z, .w, .y); + v0 = @shuffle(f32, v0, undefined, [4]i32{ 0, 2, 3, 1 }); v1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 2), ~@as(i32, 2) }); - v1 = swizzle(v1, .x, .z, .x, .z); + v1 = @shuffle(f32, v1, undefined, [4]i32{ 0, 2, 0, 2 }); q1 = @shuffle(f32, r0, v0, [4]i32{ 0, 3, ~@as(i32, 0), ~@as(i32, 1) }); - q1 = swizzle(q1, .x, .z, .w, .y); + q1 = @shuffle(f32, q1, undefined, [4]i32{ 0, 2, 3, 1 }); var m: Mat = undefined; m[0] = q1; q1 = @shuffle(f32, r0, v0, [4]i32{ 1, 3, ~@as(i32, 2), ~@as(i32, 3) }); - q1 = swizzle(q1, .z, .x, .w, .y); + q1 = @shuffle(f32, q1, undefined, [4]i32{ 2, 0, 3, 1 }); m[1] = q1; q1 = @shuffle(f32, v1, r0, [4]i32{ 0, 1, ~@as(i32, 2), ~@as(i32, 3) }); @@ -2902,17 +2902,17 @@ pub inline fn matToArr34(m: Mat) [12]f32 { // // ------------------------------------------------------------------------------ pub fn qmul(q0: Quat, q1: Quat) Quat { - var result = swizzle(q1, .w, .w, .w, .w); - var q1x = swizzle(q1, .x, .x, .x, .x); - var q1y = swizzle(q1, .y, .y, .y, .y); - var q1z = swizzle(q1, .z, .z, .z, .z); + var result = @shuffle(f32, q1, undefined, [4]i32{ 3, 3, 3, 3 }); + var q1x = @shuffle(f32, q1, undefined, [4]i32{ 0, 0, 0, 0 }); + var q1y = @shuffle(f32, q1, undefined, [4]i32{ 1, 1, 1, 1 }); + var q1z = @shuffle(f32, q1, undefined, [4]i32{ 2, 2, 2, 2 }); result = result * q0; - var q0_shuf = swizzle(q0, .w, .z, .y, .x); + var q0_shuf = @shuffle(f32, q0, undefined, [4]i32{ 3, 2, 1, 0 }); q1x = q1x * q0_shuf; - q0_shuf = swizzle(q0_shuf, .y, .x, .w, .z); + q0_shuf = @shuffle(f32, q0_shuf, undefined, [4]i32{ 1, 0, 3, 2 }); result = mulAdd(q1x, f32x4(1.0, -1.0, 1.0, -1.0), result); q1y = q1y * q0_shuf; - q0_shuf = swizzle(q0_shuf, .w, .z, .y, .x); + q0_shuf = @shuffle(f32, q0_shuf, undefined, [4]i32{ 3, 2, 1, 0 }); q1y = q1y * f32x4(1.0, 1.0, -1.0, -1.0); q1z = q1z * q0_shuf; q1y = mulAdd(q1z, f32x4(-1.0, 1.0, 1.0, -1.0), q1y); @@ -2951,9 +2951,9 @@ pub fn quatFromMat(m: Mat) Quat { const r0 = m[0]; const r1 = m[1]; const r2 = m[2]; - const r00 = swizzle(r0, .x, .x, .x, .x); - const r11 = swizzle(r1, .y, .y, .y, .y); - const r22 = swizzle(r2, .z, .z, .z, .z); + const r00 = @shuffle(f32, r0, undefined, [4]i32{ 0, 0, 0, 0 }); + const r11 = @shuffle(f32, r1, undefined, [4]i32{ 1, 1, 1, 1 }); + const r22 = @shuffle(f32, r2, undefined, [4]i32{ 2, 2, 2, 2 }); const x2gey2 = (r11 - r00) <= splat(F32x4, 0.0); const z2gew2 = (r11 + r00) <= splat(F32x4, 0.0); @@ -2966,12 +2966,12 @@ pub fn quatFromMat(m: Mat) Quat { t0 = @shuffle(f32, r0, r1, [4]i32{ 1, 2, ~@as(i32, 2), ~@as(i32, 1) }); t1 = @shuffle(f32, r1, r2, [4]i32{ 0, 0, ~@as(i32, 0), ~@as(i32, 1) }); - t1 = swizzle(t1, .x, .z, .w, .y); + t1 = @shuffle(f32, t1, undefined, [4]i32{ 0, 2, 3, 1 }); const xyxzyz = t0 + t1; t0 = @shuffle(f32, r2, r1, [4]i32{ 1, 0, ~@as(i32, 0), ~@as(i32, 0) }); t1 = @shuffle(f32, r1, r0, [4]i32{ 2, 2, ~@as(i32, 2), ~@as(i32, 1) }); - t1 = swizzle(t1, .x, .z, .w, .y); + t1 = @shuffle(f32, t1, undefined, [4]i32{ 0, 2, 3, 1 }); const xwywzw = (t0 - t1) * f32x4(-1.0, 1.0, -1.0, 1.0); t0 = @shuffle(f32, x2y2z2w2, xyxzyz, [4]i32{ 0, 1, ~@as(i32, 0), ~@as(i32, 0) }); @@ -3100,8 +3100,8 @@ pub fn slerpV(q0: Quat, q1: Quat, t: F32x4) Quat { var s0 = sin(v01 * omega) / sin_omega; s0 = select(cos_omega < splat(F32x4, 1.0 - 0.00001), s0, v01); - const s1 = swizzle(s0, .y, .y, .y, .y); - s0 = swizzle(s0, .x, .x, .x, .x); + const s1 = @shuffle(f32, s0, undefined, [4]i32{ 1, 1, 1, 1 }); + s0 = @shuffle(f32, s0, undefined, [4]i32{ 0, 0, 0, 0 }); return q0 * s0 + sign * q1 * s1; } @@ -3117,7 +3117,7 @@ test "zmath.quaternion.slerp" { pub fn quatToRollPitchYaw(q: Quat) [3]f32 { var angles: [3]f32 = undefined; - const p = swizzle(q, .w, .y, .x, .z); + const p = @shuffle(f32, q, undefined, [4]i32{ 3, 1, 0, 2 }); const sign = -1.0; const singularity = p[0] * p[2] + sign * p[1] * p[3]; @@ -3231,9 +3231,9 @@ pub fn adjustContrast(color: F32x4, contrast: f32) F32x4 { } pub fn rgbToHsl(rgb: F32x4) F32x4 { - const r = swizzle(rgb, .x, .x, .x, .x); - const g = swizzle(rgb, .y, .y, .y, .y); - const b = swizzle(rgb, .z, .z, .z, .z); + const r = @shuffle(f32, rgb, undefined, [4]i32{ 0, 0, 0, 0 }); + const g = @shuffle(f32, rgb, undefined, [4]i32{ 1, 1, 1, 1 }); + const b = @shuffle(f32, rgb, undefined, [4]i32{ 2, 2, 2, 2 }); const minv = min(r, min(g, b)); const maxv = max(r, max(g, b)); @@ -3305,13 +3305,13 @@ fn hueToClr(p: F32x4, q: F32x4, h: F32x4) F32x4 { } pub fn hslToRgb(hsl: F32x4) F32x4 { - const s = swizzle(hsl, .y, .y, .y, .y); - const l = swizzle(hsl, .z, .z, .z, .z); + const s = @shuffle(f32, hsl, undefined, [4]i32{ 1, 1, 1, 1 }); + const l = @shuffle(f32, hsl, undefined, [4]i32{ 2, 2, 2, 2 }); if (all(isNearEqual(s, f32x4s(0.0), f32x4s(math.floatEps(f32))), 3)) { return select(boolx4(true, true, true, false), l, hsl); } else { - const h = swizzle(hsl, .x, .x, .x, .x); + const h = @shuffle(f32, hsl, undefined, [4]i32{ 0, 0, 0, 0 }); var q: F32x4 = undefined; if (all(l < f32x4s(0.5), 3)) { q = l * (f32x4s(1.0) + s); @@ -3361,9 +3361,9 @@ test "zmath.color.hslToRgb" { } pub fn rgbToHsv(rgb: F32x4) F32x4 { - const r = swizzle(rgb, .x, .x, .x, .x); - const g = swizzle(rgb, .y, .y, .y, .y); - const b = swizzle(rgb, .z, .z, .z, .z); + const r = @shuffle(f32, rgb, undefined, [4]i32{ 0, 0, 0, 0 }); + const g = @shuffle(f32, rgb, undefined, [4]i32{ 1, 1, 1, 1 }); + const b = @shuffle(f32, rgb, undefined, [4]i32{ 2, 2, 2, 2 }); const minv = min(r, min(g, b)); const v = max(r, max(g, b)); @@ -3404,9 +3404,9 @@ test "zmath.color.rgbToHsv" { } pub fn hsvToRgb(hsv: F32x4) F32x4 { - const h = swizzle(hsv, .x, .x, .x, .x); - const s = swizzle(hsv, .y, .y, .y, .y); - const v = swizzle(hsv, .z, .z, .z, .z); + const h = @shuffle(f32, hsv, undefined, [4]i32{ 0, 0, 0, 0 }); + const s = @shuffle(f32, hsv, undefined, [4]i32{ 1, 1, 1, 1 }); + const v = @shuffle(f32, hsv, undefined, [4]i32{ 2, 2, 2, 2 }); const h6 = h * f32x4s(6.0); const i = floor(h6); @@ -3790,21 +3790,21 @@ pub fn cmulSoa(re0: anytype, im0: anytype, re1: anytype, im1: anytype) [2]@TypeO // // ------------------------------------------------------------------------------ fn fftButterflyDit4_1(re0: *F32x4, im0: *F32x4) void { - const re0l = swizzle(re0.*, .x, .x, .y, .y); - const re0h = swizzle(re0.*, .z, .z, .w, .w); + const re0l = @shuffle(f32, re0.*, undefined, [4]i32{ 0, 0, 1, 1 }); + const re0h = @shuffle(f32, re0.*, undefined, [4]i32{ 2, 2, 3, 3 }); - const im0l = swizzle(im0.*, .x, .x, .y, .y); - const im0h = swizzle(im0.*, .z, .z, .w, .w); + const im0l = @shuffle(f32, im0.*, undefined, [4]i32{ 0, 0, 1, 1 }); + const im0h = @shuffle(f32, im0.*, undefined, [4]i32{ 2, 2, 3, 3 }); const re_temp = mulAdd(re0h, f32x4(1.0, -1.0, 1.0, -1.0), re0l); const im_temp = mulAdd(im0h, f32x4(1.0, -1.0, 1.0, -1.0), im0l); const re_shuf0 = @shuffle(f32, re_temp, im_temp, [4]i32{ 2, 3, ~@as(i32, 2), ~@as(i32, 3) }); - const re_shuf = swizzle(re_shuf0, .x, .w, .x, .w); - const im_shuf = swizzle(re_shuf0, .z, .y, .z, .y); + const re_shuf = @shuffle(f32, re_shuf0, undefined, [4]i32{ 0, 3, 0, 3 }); + const im_shuf = @shuffle(f32, re_shuf0, undefined, [4]i32{ 2, 1, 2, 1 }); - const re_templ = swizzle(re_temp, .x, .y, .x, .y); - const im_templ = swizzle(im_temp, .x, .y, .x, .y); + const re_templ = @shuffle(f32, re_temp, undefined, [4]i32{ 0, 1, 0, 1 }); + const im_templ = @shuffle(f32, im_temp, undefined, [4]i32{ 0, 1, 0, 1 }); re0.* = mulAdd(re_shuf, f32x4(1.0, 1.0, -1.0, -1.0), re_templ); im0.* = mulAdd(im_shuf, f32x4(1.0, -1.0, -1.0, 1.0), im_templ); From 351559aedeedfd2c98417af4c1ca9fa3b6b3a721 Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Mon, 15 Jul 2024 07:45:13 -0700 Subject: [PATCH 3/6] using std.simd.iota --- libs/zmath/src/zmath.zig | 16 ++-------------- 1 file changed, 2 insertions(+), 14 deletions(-) diff --git a/libs/zmath/src/zmath.zig b/libs/zmath/src/zmath.zig index 42b453cfd..e40d56f30 100644 --- a/libs/zmath/src/zmath.zig +++ b/libs/zmath/src/zmath.zig @@ -480,13 +480,7 @@ pub fn all(vb: anytype, comptime len: u32) bool { if (lenOrVecLen == veclen(T)) { return @reduce(.And, vb); } - const resizeMask = comptime blk: { - var mask: [len]i32 = undefined; - for (0..len) |i| { - mask[i] = i; - } - break :blk mask; - }; + const resizeMask = std.simd.iota(i32, lenOrVecLen); const resized = @shuffle(childType, vb, undefined, resizeMask); return @reduce(.And, resized); } @@ -529,13 +523,7 @@ pub fn any(vb: anytype, comptime len: u32) bool { if (lenOrVecLen == veclen(T)) { return @reduce(.Or, vb); } - const resizeMask = comptime blk: { - var mask: [len]i32 = undefined; - for (0..len) |i| { - mask[i] = i; - } - break :blk mask; - }; + const resizeMask = std.simd.iota(i32, lenOrVecLen); const resized = @shuffle(childType, vb, undefined, resizeMask); return @reduce(.Or, resized); } From 0bdda7d0c0015bcd10b40be17e745b1af7c27024 Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Mon, 15 Jul 2024 07:56:59 -0700 Subject: [PATCH 4/6] updated benchmark data --- libs/zmath/src/benchmark.zig | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/libs/zmath/src/benchmark.zig b/libs/zmath/src/benchmark.zig index 39ec3ddde..7f78a20e7 100644 --- a/libs/zmath/src/benchmark.zig +++ b/libs/zmath/src/benchmark.zig @@ -22,13 +22,13 @@ // wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s // // ------------------------------------------------------------------------------------------------- -// 'Apple M1 Max', macOS Version 12.4, Zig 0.10.0-dev.2657+74442f350, ReleaseFast +// 'Apple M1 Max', macOS Version 12.5, Zig 0.13.0, ReleaseFast // ------------------------------------------------------------------------------------------------- -// matrix mul benchmark (AOS) - scalar version: 1.0297s, zmath version: 1.0538s -// cross3, scale, bias benchmark (AOS) - scalar version: 0.6294s, zmath version: 0.6532s -// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9807s, zmath version: 1.0988s -// quaternion mul benchmark (AOS) - scalar version: 1.5413s, zmath version: 0.7800s -// wave benchmark (SOA) - scalar version: 3.4220s, zmath version: 1.0255s +// matrix mul benchmark (AOS) - scalar version: 1.0012s, zmath version: 0.9848s +// cross3, scale, bias benchmark (AOS) - scalar version: 0.6222s, zmath version: 0.6437s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9776s, zmath version: 0.9504s +// quaternion mul benchmark (AOS) - scalar version: 0.9808s, zmath version: 0.7998s +// wave benchmark (SOA) - scalar version: 3.3917s, zmath version: 1.0295 // // ------------------------------------------------------------------------------------------------- // '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast From 51db2ccd3f341c605bffeeeb9cb6224084a49200 Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Mon, 15 Jul 2024 07:58:30 -0700 Subject: [PATCH 5/6] whoops --- libs/zmath/src/benchmark.zig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/libs/zmath/src/benchmark.zig b/libs/zmath/src/benchmark.zig index 7f78a20e7..85d3c18db 100644 --- a/libs/zmath/src/benchmark.zig +++ b/libs/zmath/src/benchmark.zig @@ -22,7 +22,7 @@ // wave benchmark (SOA) - scalar version: 3.6598s, zmath version: 0.4231s // // ------------------------------------------------------------------------------------------------- -// 'Apple M1 Max', macOS Version 12.5, Zig 0.13.0, ReleaseFast +// 'Apple M1 Pro', macOS Version 12.5, Zig 0.13.0, ReleaseFast // ------------------------------------------------------------------------------------------------- // matrix mul benchmark (AOS) - scalar version: 1.0012s, zmath version: 0.9848s // cross3, scale, bias benchmark (AOS) - scalar version: 0.6222s, zmath version: 0.6437s From c18ed5b002132239d7cc19ec4d08317e37892de5 Mon Sep 17 00:00:00 2001 From: Daniel Murphy Date: Wed, 17 Jul 2024 19:52:52 -0700 Subject: [PATCH 6/6] revert and fix --- libs/zmath/src/benchmark.zig | 10 ++--- libs/zmath/src/zmath.zig | 72 ++++++++++++++++++++++++++---------- 2 files changed, 58 insertions(+), 24 deletions(-) diff --git a/libs/zmath/src/benchmark.zig b/libs/zmath/src/benchmark.zig index 85d3c18db..ee22c7c7d 100644 --- a/libs/zmath/src/benchmark.zig +++ b/libs/zmath/src/benchmark.zig @@ -24,11 +24,11 @@ // ------------------------------------------------------------------------------------------------- // 'Apple M1 Pro', macOS Version 12.5, Zig 0.13.0, ReleaseFast // ------------------------------------------------------------------------------------------------- -// matrix mul benchmark (AOS) - scalar version: 1.0012s, zmath version: 0.9848s -// cross3, scale, bias benchmark (AOS) - scalar version: 0.6222s, zmath version: 0.6437s -// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9776s, zmath version: 0.9504s -// quaternion mul benchmark (AOS) - scalar version: 0.9808s, zmath version: 0.7998s -// wave benchmark (SOA) - scalar version: 3.3917s, zmath version: 1.0295 +// matrix mul benchmark (AOS) - scalar version: 0.9970s, zmath version: 0.9777s +// cross3, scale, bias benchmark (AOS) - scalar version: 0.6250s, zmath version: 0.6423s +// cross3, dot3, scale, bias benchmark (AOS) - scalar version: 0.9822s, zmath version: 0.9717s +// quaternion mul benchmark (AOS) - scalar version: 0.9872s, zmath version: 0.7767s +// wave benchmark (SOA) - scalar version: 3.3965s, zmath version: 1.0280s // // ------------------------------------------------------------------------------------------------- // '11th Gen Intel(R) Core(TM) i7-11800H @ 2.30GHz', Windows 11, Zig 0.10.0-dev.2620+0e9458a3f, ReleaseFast diff --git a/libs/zmath/src/zmath.zig b/libs/zmath/src/zmath.zig index e40d56f30..58c537a6b 100644 --- a/libs/zmath/src/zmath.zig +++ b/libs/zmath/src/zmath.zig @@ -465,7 +465,8 @@ pub inline fn vecToArr4(v: Vec) [4]f32 { } // ------------------------------------------------------------------------------ // -// 2. Functions that work on all vector components (F32xN = F32x4 or F32x8 or F32x16) +// 2. Functions that work on all float vector components (F32xN = F32x4 or F32x8 +// or F32x16), as well as boolxN and integer vectors. // // ------------------------------------------------------------------------------ pub fn all(vb: anytype, comptime len: u32) bool { @@ -476,13 +477,24 @@ pub fn all(vb: anytype, comptime len: u32) bool { const lenOrVecLen = comptime if (len == 0) veclen(T) else len; // Handle int and bool types that can use @reduce. const childType = @typeInfo(T).Vector.child; - if (childType == bool or childType == std.builtin.Type.Int) { - if (lenOrVecLen == veclen(T)) { - return @reduce(.And, vb); - } - const resizeMask = std.simd.iota(i32, lenOrVecLen); - const resized = @shuffle(childType, vb, undefined, resizeMask); - return @reduce(.And, resized); + switch (@typeInfo(childType)) { + .Int => { + if (lenOrVecLen == veclen(T)) { + return @reduce(.And, vb) != 0; + } + const resizeMask = std.simd.iota(i32, lenOrVecLen); + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.And, resized) != 0; + }, + .Bool => { + if (lenOrVecLen == veclen(T)) { + return @reduce(.And, vb); + } + const resizeMask = std.simd.iota(i32, lenOrVecLen); + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.And, resized); + }, + else => {}, } // Float vectors don't support '.And', so manually loop. comptime var i: u32 = 0; @@ -510,6 +522,9 @@ test "zmath.all" { try expect(all(f32x4(0, 0, 0, 0), 0) == false); try expect(all(f32x4(0, 0, 0, 1), 1) == false); try expect(all(f32x4(1, 0, 0, 0), 1) == true); + try expect(all(@Vector(4, i32){ 0, 0, 0, 1 }, 1) == false); + try expect(all(@Vector(4, i32){ 1, 1, 0, 1 }, 1) == true); + try expect(all(@Vector(4, i32){ 1, 1, 0, 1 }, 0) == false); } pub fn any(vb: anytype, comptime len: u32) bool { @@ -519,13 +534,24 @@ pub fn any(vb: anytype, comptime len: u32) bool { } const lenOrVecLen = comptime if (len == 0) veclen(T) else len; const childType = @typeInfo(T).Vector.child; - if (childType == bool or childType == std.builtin.Type.Int) { - if (lenOrVecLen == veclen(T)) { - return @reduce(.Or, vb); - } - const resizeMask = std.simd.iota(i32, lenOrVecLen); - const resized = @shuffle(childType, vb, undefined, resizeMask); - return @reduce(.Or, resized); + switch (@typeInfo(childType)) { + .Int => { + if (lenOrVecLen == veclen(T)) { + return @reduce(.Or, vb) != 0; + } + const resizeMask = std.simd.iota(i32, lenOrVecLen); + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.Or, resized) != 0; + }, + .Bool => { + if (lenOrVecLen == veclen(T)) { + return @reduce(.Or, vb); + } + const resizeMask = std.simd.iota(i32, lenOrVecLen); + const resized = @shuffle(childType, vb, undefined, resizeMask); + return @reduce(.Or, resized); + }, + else => {}, } comptime var i: u32 = 0; var result = false; @@ -542,6 +568,9 @@ test "zmath.any" { try expect(any(f32x4(0, 0, 0, 0), 0) == false); try expect(any(f32x4(1, 0, 0, 1), 1) == true); try expect(any(f32x4(0, 0, 0, 1), 1) == false); + try expect(any(@Vector(4, i32){ 1, 0, 0, 1 }, 1) == false); + try expect(any(@Vector(4, i32){ 0, 1, 0, 1 }, 1) == false); + try expect(all(@Vector(4, i32){ 0, 1, 0, 1 }, 0) == true); } pub inline fn isNearEqual( @@ -1939,8 +1968,10 @@ test "zmath.atan2" { // // ------------------------------------------------------------------------------ pub inline fn dot2(v0: Vec, v1: Vec) F32x4 { - const xmm0 = v0 * v1; - return @splat(xmm0[0] + xmm0[1]); + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | -- | -- | + const xmm1 = swizzle(xmm0, .y, .x, .x, .x); // | y0*y1 | -- | -- | -- | + xmm0 = f32x4(xmm0[0] + xmm1[0], xmm0[1], xmm0[2], xmm0[3]); // | x0*x1 + y0*y1 | -- | -- | -- | + return swizzle(xmm0, .x, .x, .x, .x); } test "zmath.dot2" { const v0 = f32x4(-1.0, 2.0, 300.0, -2.0); @@ -1961,8 +1992,11 @@ test "zmath.dot3" { } pub inline fn dot4(v0: Vec, v1: Vec) F32x4 { - const xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | - return @splat(xmm0[0] + xmm0[1] + xmm0[2] + xmm0[3]); + var xmm0 = v0 * v1; // | x0*x1 | y0*y1 | z0*z1 | w0*w1 | + var xmm1 = @shuffle(f32, xmm0, undefined, [4]i32{ 1, 0, 3, 2 }); // | y0*y1 | -- | w0*w1 | -- | + xmm1 = xmm0 + xmm1; // | x0*x1 + y0*y1 | x0*x1 + y0*y1 | z0*z1 + w0*w1 | z0*z1 + w0*w1 | + xmm0 = @shuffle(f32, xmm1, undefined, [4]i32{ 3, 2, 1, 0 }); // | z0*z1 + w0*w1 | z0*z1 + w0*w1 | x0*x1 + y0*y1 | x0*x1 + y0*y1 | + xmm0 = xmm0 + xmm1; } test "zmath.dot4" { const v0 = f32x4(-1.0, 2.0, 3.0, -2.0); pFad - Phonifier reborn

Pfad - The Proxy pFad of © 2024 Garber Painting. All rights reserved.

Note: This service is not intended for secure transactions such as banking, social media, email, or purchasing. Use at your own risk. We assume no liability whatsoever for broken pages.


Alternative Proxies:

Alternative Proxy

pFad Proxy

pFad v3 Proxy

pFad v4 Proxy