diff --git a/src/FsMath/Matrix.fs b/src/FsMath/Matrix.fs index 62b2f23..1c33726 100644 --- a/src/FsMath/Matrix.fs +++ b/src/FsMath/Matrix.fs @@ -580,8 +580,13 @@ type Matrix<'T when 'T :> Numerics.INumber<'T> /// /// Computes row-vector v (length = mat.NumRows) times matrix mat (size = [NumRows × NumCols]), - /// returning a new vector of length mat.NumCols. Uses chunk-based SIMD with manual gather. + /// returning a new vector of length mat.NumCols. Uses SIMD-optimized weighted row summation. /// + /// + /// This implementation reorganizes v × M as a weighted sum of matrix rows: + /// result = v[0]*row0 + v[1]*row1 + ... + v[n-1]*row(n-1) + /// This exploits row-major storage for contiguous memory access and SIMD acceleration. + /// static member inline multiplyRowVector<'T when 'T :> Numerics.INumber<'T> and 'T : struct @@ -589,7 +594,7 @@ type Matrix<'T when 'T :> Numerics.INumber<'T> and 'T :> ValueType> (v: Vector<'T>) (mat: Matrix<'T>) : Vector<'T> = - + // 1) Dimension checks let n = mat.NumRows let m = mat.NumCols @@ -599,14 +604,45 @@ type Matrix<'T when 'T :> Numerics.INumber<'T> let result = Vector.zeroCreate<'T> m let data = mat.Data // row-major: element (i,j) is data.[i*m + j] - // O(n*m) nested loops (not SIMD seems to be faster in the row-layout matrix) - for j = 0 to m - 1 do - let mutable sum = 'T.Zero + // SIMD optimization: compute as weighted sum of rows + // result = v[0]*row0 + v[1]*row1 + ... + v[n-1]*row(n-1) + if Numerics.Vector.IsHardwareAccelerated && m >= Numerics.Vector<'T>.Count then + let simdWidth = Numerics.Vector<'T>.Count + let simdCount = m / simdWidth + let scalarStart = simdCount * simdWidth + + let resultSpan = result.AsSpan() + let resultSimd = MemoryMarshal.Cast<'T, Numerics.Vector<'T>>(resultSpan) + + // Process each row of the matrix for i = 0 to n - 1 do - sum <- sum + (v.[i] * data.[i*m + j]) - result.[j] <- sum + let weight = v.[i] + if weight <> 'T.Zero then // Skip zero weights + let rowOffset = i * m + let rowSpan = data.AsSpan(rowOffset, m) + let rowSimd = MemoryMarshal.Cast<'T, Numerics.Vector<'T>>(rowSpan) - result + // Broadcast the weight to all SIMD lanes + let weightVec = Numerics.Vector<'T>(weight) + + // SIMD: accumulate weight * row into result + for j = 0 to simdCount - 1 do + resultSimd.[j] <- resultSimd.[j] + weightVec * rowSimd.[j] + + // Scalar tail: process remaining elements + for j = scalarStart to m - 1 do + result.[j] <- result.[j] + weight * data.[rowOffset + j] + + result + else + // Fallback for small matrices or no SIMD: use original column-wise approach + for j = 0 to m - 1 do + let mutable sum = 'T.Zero + for i = 0 to n - 1 do + sum <- sum + (v.[i] * data.[i*m + j]) + result.[j] <- sum + + result ///