forked from james-bowman/nlp
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathexample_test.go
63 lines (51 loc) · 4.66 KB
/
example_test.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
package nlp_test
import (
"fmt"
"github.com/james-bowman/nlp"
"github.com/james-bowman/nlp/measures/pairwise"
"gonum.org/v1/gonum/mat"
)
// Example builds a Latent Semantic Indexing (LSI) pipeline from a count
// vectoriser, a tf-idf transformer and a truncated SVD, fits it to a small
// corpus, projects a query through the same fitted pipeline and prints the
// corpus document whose vector has the highest cosine similarity to the query.
func Example() {
	testCorpus := []string{
		"The quick brown fox jumped over the lazy dog",
		"hey diddle diddle, the cat and the fiddle",
		"the cow jumped over the moon",
		"the little dog laughed to see such fun",
		"and the dish ran away with the spoon",
	}

	// Words handed to the count vectoriser as stop words.
	stopWords := []string{"a", "about", "above", "above", "across", "after", "afterwards", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "amongst", "amoungst", "amount", "an", "and", "another", "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around", "as", "at", "back", "be", "became", "because", "become", "becomes", "becoming", "been", "before", "beforehand", "behind", "being", "below", "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but", "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt", "cry", "de", "describe", "detail", "do", "done", "down", "due", "during", "each", "eg", "eight", "either", "eleven", "else", "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone", "everything", "everywhere", "except", "few", "fifteen", "fify", "fill", "find", "fire", "first", "five", "for", "former", "formerly", "forty", "found", "four", "from", "front", "full", "further", "get", "give", "go", "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter", "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his", "how", "however", "hundred", "ie", "if", "in", "inc", "indeed", "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter", "latterly", "least", "less", "ltd", "made", "many", "may", "me", "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly", "move", "much", "must", "my", "myself", "name", "namely", "neither", "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone", "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on", "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our", "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps", "please", "put", "rather", "re", "same", "see", "seem", "seemed", "seeming", "seems", "serious", "several", "she", "should", "show", "side", "since", "sincere", "six",
		"sixty", "so", "some", "somehow", "someone", "something", "sometime", "sometimes", "somewhere", "still", "such", "system", "take", "ten", "than", "that", "the", "their", "them", "themselves", "then", "thence", "there", "thereafter", "thereby", "therefore", "therein", "thereupon", "these", "they", "thickv", "thin", "third", "this", "those", "though", "three", "through", "throughout", "thru", "thus", "to", "together", "too", "top", "toward", "towards", "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us", "very", "via", "was", "we", "well", "were", "what", "whatever", "when", "whence", "whenever", "where", "whereafter", "whereas", "whereby", "wherein", "whereupon", "wherever", "whether", "which", "while", "whither", "who", "whoever", "whole", "whom", "whose", "why", "will", "with", "within", "without", "would", "yet", "you", "your", "yours", "yourself", "yourselves"}

	query := "the brown fox ran around the dog"

	vectoriser := nlp.NewCountVectoriser(stopWords...)
	transformer := nlp.NewTfidfTransformer()

	// set k (the number of dimensions following truncation) to 4
	reducer := nlp.NewTruncatedSVD(4)

	lsiPipeline := nlp.NewPipeline(vectoriser, transformer, reducer)

	// Fit the pipeline to the corpus and transform the corpus into its LSI
	// representation in a single step.
	lsi, err := lsiPipeline.FitTransform(testCorpus...)
	if err != nil {
		fmt.Printf("Failed to process documents because %v", err)
		return
	}

	// Run the query through the same pipeline that was fitted to the corpus
	// so that it is projected into the same dimensional space.
	queryVector, err := lsiPipeline.Transform(query)
	if err != nil {
		fmt.Printf("Failed to process query because %v", err)
		return
	}

	// The query was transformed on its own, so its feature vector is column 0
	// of the result. Both column views are loop-invariant, so resolve them
	// once up front rather than on every iteration.
	queryColumn := queryVector.(mat.ColViewer).ColView(0)
	docColumns := lsi.(mat.ColViewer)

	// Iterate over the document feature vectors (columns) in the LSI matrix
	// and compare each with the query vector. Similarity is measured as the
	// cosine of the angle between the two vectors (cosine similarity), so the
	// search starts from -1.0, the lowest value a cosine can take.
	highestSimilarity := -1.0
	var matched int
	_, docs := lsi.Dims()
	for i := 0; i < docs; i++ {
		similarity := pairwise.CosineSimilarity(queryColumn, docColumns.ColView(i))
		if similarity > highestSimilarity {
			matched = i
			highestSimilarity = similarity
		}
	}

	fmt.Printf("Matched '%s'", testCorpus[matched])
	// Output: Matched 'The quick brown fox jumped over the lazy dog'
}