forked from james-bowman/nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdimreduction.go
213 lines (178 loc) · 6.76 KB
/
dimreduction.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
package nlp
import (
"encoding/binary"
"fmt"
"io"
"github.com/james-bowman/sparse"
"gonum.org/v1/gonum/mat"
"gonum.org/v1/gonum/stat"
)
// TruncatedSVD implements the Singular Value Decomposition factorisation of matrices.
// This produces an approximation of the input matrix at a lower rank. This is a core
// component of LSA (Latent Semantic Analsis)
type TruncatedSVD struct {
// Components is the truncated term matrix (matrix U of the Singular Value Decomposition
// (A=USV^T)). The matrix will be of size m, k where m = the number of unique terms
// in the training data and k = the number of elements to truncate to (specified by
// attribute K) or m or n (the number of documents in the training data) whichever of
// the 3 values is smaller.
Components *mat.Dense
// K is the number of dimensions to which the output, transformed, matrix should be
// truncated to. The matrix output by the FitTransform() and Transform() methods will
// be n rows by min(m, n, K) columns, where n is the number of columns in the original,
// input matrix and min(m, n, K) is the lowest value of m, n, K where m is the number of
// rows in the original, input matrix.
K int
}
// NewTruncatedSVD creates a new TruncatedSVD transformer with K (the truncated
// dimensionality) being set to the specified value k
func NewTruncatedSVD(k int) *TruncatedSVD {
return &TruncatedSVD{K: k}
}
// Fit performs the SVD factorisation on the input training data matrix, mat and
// stores the output term matrix as a transform to apply to matrices in the Transform matrix.
func (t *TruncatedSVD) Fit(mat mat.Matrix) Transformer {
if _, err := t.FitTransform(mat); err != nil {
panic("nlp: Failed to fit truncated SVD because " + err.Error())
}
return t
}
// Transform applies the transform decomposed from the training data matrix in Fit()
// to the input matrix. The resulting output matrix will be the closest approximation
// to the input matrix at a reduced rank. The returned matrix is a dense matrix type.
func (t *TruncatedSVD) Transform(m mat.Matrix) (mat.Matrix, error) {
var product mat.Dense
product.Mul(t.Components.T(), m)
return &product, nil
}
// FitTransform is approximately equivalent to calling Fit() followed by Transform()
// on the same matrix. This is a useful shortcut where separate training data is not being
// used to fit the model i.e. the model is fitted on the fly to the test data.
// The returned matrix is a dense matrix type.
func (t *TruncatedSVD) FitTransform(m mat.Matrix) (mat.Matrix, error) {
var svd mat.SVD
if ok := svd.Factorize(m, mat.SVDThin); !ok {
return nil, fmt.Errorf("Failed SVD Factorisation of working matrix")
}
s, u, v := t.extractSVD(&svd)
r, c := m.Dims()
min := minimum(t.K, r, c)
// truncate U and V matrices to k << min(m, n)
uk := u.Slice(0, r, 0, min)
vk := v.Slice(0, c, 0, min)
t.Components = uk.(*mat.Dense)
// multiply Sigma by transpose of V. As sigma is a symmetrical (square) diagonal matrix it is
// more efficient to simply multiply each element from the array of diagonal values with each
// element from the matrix V rather than multiplying out the non-zero values from off the diagonal.
var product mat.Dense
product.Apply(func(i, j int, v float64) float64 {
return (v * s[i])
}, vk.T())
return &product, nil
}
func minimum(k, m, n int) int {
return min(k, min(m, n))
}
func min(m, n int) int {
if m < n {
return m
}
return n
}
func (t *TruncatedSVD) extractSVD(svd *mat.SVD) (s []float64, u, v *mat.Dense) {
var um, vm mat.Dense
svd.UTo(&um)
svd.VTo(&vm)
s = svd.Values(nil)
return s, &um, &vm
}
// Save binary serialises the model and writes it into w. This is useful for persisting
// a trained model to disk so that it may be loaded (using the Load() method)in another
// context (e.g. production) for reproducible results.
func (t TruncatedSVD) Save(w io.Writer) error {
var buf [8]byte
binary.LittleEndian.PutUint64(buf[:], uint64(t.K))
if _, err := w.Write(buf[:]); err != nil {
return err
}
_, err := t.Components.MarshalBinaryTo(w)
return err
}
// Load binary deserialises the previously serialised model into the receiver. This is
// useful for loading a previously trained and saved model from another context
// (e.g. offline training) for use within another context (e.g. production) for
// reproducible results. Load should only be performed with trusted data.
func (t *TruncatedSVD) Load(r io.Reader) error {
var n int
var buf [8]byte
var err error
for n < len(buf) && err == nil {
var nn int
nn, err = r.Read(buf[n:])
n += nn
}
if err == io.EOF {
return io.ErrUnexpectedEOF
}
if err != nil {
return err
}
k := int(binary.LittleEndian.Uint64(buf[:]))
var model mat.Dense
if _, err := model.UnmarshalBinaryFrom(r); err != nil {
return err
}
t.K = k
t.Components = &model
return nil
}
// PCA calculates the principal components of a matrix, or the axis of greatest variance and
// then projects matrices onto those axis.
// See https://en.wikipedia.org/wiki/Principal_component_analysis for further details.
type PCA struct {
// K is the number of components
K int
pc *stat.PC
}
// NewPCA constructs a new Principal Component Analysis transformer to reduce the dimensionality,
// projecting matrices onto the axis of greatest variance
func NewPCA(k int) *PCA {
return &PCA{K: k, pc: &stat.PC{}}
}
// Fit calculates the principal component directions (axis of greatest variance) within the
// training data which can then be used to project matrices onto those principal components using
// the Transform() method.
func (p *PCA) Fit(m mat.Matrix) Transformer {
if ok := p.pc.PrincipalComponents(m.T(), nil); !ok {
panic("nlp: PCA analysis failed during fitting")
}
return p
}
// Transform projects the matrix onto the first K principal components calculated during training
// (the Fit() method). The returned matrix will be of reduced dimensionality compared to the input
// (K x c compared to r x c of the input).
func (p *PCA) Transform(m mat.Matrix) (mat.Matrix, error) {
r, _ := m.Dims()
//var proj mat.Dense
var proj sparse.CSR
var dst mat.Dense
p.pc.VectorsTo(&dst)
proj.Mul(m.T(), dst.Slice(0, r, 0, p.K))
// matrix is r x c (t x d)
// m.T() = c x r (d x t)
// slice c x K
// (ar x ac) * (br x bc) = ar x bc
// ac == br
return proj.T(), nil
}
// FitTransform is approximately equivalent to calling Fit() followed by Transform()
// on the same matrix. This is a useful shortcut where separate training data is not being
// used to fit the model i.e. the model is fitted on the fly to the test data.
func (p *PCA) FitTransform(m mat.Matrix) (mat.Matrix, error) {
return p.Fit(m).Transform(m)
}
// ExplainedVariance returns a slice of float64 values representing the variances of the
// principal component scores.
func (p *PCA) ExplainedVariance() []float64 {
return p.pc.VarsTo(nil)
}