Skip to content

Commit

Permalink
add a cosine similarity analysis cos_similarity()
Browse files Browse the repository at this point in the history
  • Loading branch information
hhaensel committed Jan 6, 2021
1 parent 992596b commit 7b3ce8c
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 10 deletions.
10 changes: 5 additions & 5 deletions src/TextAnalysis.jl
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ module TextAnalysis
export hash_dtv, each_hash_dtv, hash_dtm, hash_tdm
export CooMatrix, coom
export standardize!
export tf, tf_idf, bm_25, lsa, lda, summarize
export tf, tf_idf, bm_25, lsa, lda, summarize, cos_similarity
export tf!, tf_idf!, bm_25!, lda!
export remove_patterns!, remove_patterns
export prune!
Expand All @@ -61,7 +61,7 @@ module TextAnalysis
export tag_scheme!
export rouge_l_summary, rouge_l_sentence, rouge_n
export PerceptronTagger, fit!, predict

export Vocabulary, lookup, update
export everygram, padding_ngram
export maskedscore, logscore, entropy, perplexity
Expand Down Expand Up @@ -93,14 +93,14 @@ module TextAnalysis
include("coom.jl")



# Lang_model
include("LM/vocab.jl")
include("LM/langmodel.jl")
include("LM/langmodel.jl")
include("LM/api.jl")
include("LM/counter.jl")
include("LM/preprocessing.jl")



function __init__()
Expand Down
30 changes: 30 additions & 0 deletions src/tf_idf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -301,3 +301,33 @@ function tf_bm25!(dtm::AbstractMatrix{T}, tf::AbstractMatrix{F}
end
return tf
end

"""
    cos_similarity(tfm::AbstractMatrix)

Return the matrix of pairwise cosine similarities between the rows of `tfm`, a term
matrix (typically the tf-idf matrix).

Rows with zero norm (empty documents) are scaled by 1 instead of 0, which avoids a
division by zero.

# Example
```
crps = Corpus( StringDocument.([
    "to be or not to be",
    "to sing or not to sing",
    "to talk or to silence"]) )
update_lexicon!(crps)
d = dtm(crps)
tfm = tf_idf(d)
cs = cos_similarity(tfm)
Matrix(cs)
# 3×3 Array{Float64,2}:
#  1.0        0.0329318  0.0
#  0.0329318  1.0        0.0
#  0.0        0.0        1.0
```
"""
function cos_similarity(tfm::AbstractMatrix)
    # Gram matrix: entry (i, j) is the inner product of rows i and j.
    gram = tfm * tfm'
    # The diagonal holds the squared row norms.
    norms = sqrt.(diag(gram))
    # A zero norm only occurs for an empty row; use 1 so the division below is safe.
    for i in eachindex(norms)
        if iszero(norms[i])
            norms[i] = 1
        end
    end
    # Divide every inner product by the product of the two row norms.
    return gram ./ (norms * norms')
end
23 changes: 18 additions & 5 deletions test/tf_idf.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,10 +6,7 @@
doc3 = ""
doc4 = "another another text text text text"

# TODO: this should work!
# crps = Corpus(map(StringDocument, [doc1 doc2 doc3 doc4]))

crps = Corpus(Any[StringDocument(doc1), StringDocument(doc2), StringDocument(doc3), StringDocument(doc4)])
crps = Corpus(StringDocument.([doc1, doc2, doc3, doc4]))

update_lexicon!(crps)
m = DocumentTermMatrix(crps)
Expand Down Expand Up @@ -76,7 +73,7 @@
doc3 = ""
doc4 = "another another text text text text"

crps = Corpus(Any[StringDocument(doc1), StringDocument(doc2), StringDocument(doc3), StringDocument(doc4)])
crps = Corpus(StringDocument.([doc1, doc2, doc3, doc4]))

update_lexicon!(crps)
m = DocumentTermMatrix(crps)
Expand Down Expand Up @@ -120,4 +117,20 @@

@test_throws MethodError bm_25!(DocumentTermMatrix(crps))
end

@testset "cosine similarity `cos_similarity()`" begin
    # Three short documents; per the expected matrix below, the third has zero
    # similarity with the other two.
    crps = Corpus( StringDocument.([
        "to be or not to be",
        "to sing or not to sing",
        "to talk or to silence"]) )
    update_lexicon!(crps)
    d = dtm(crps)
    tfm = tf_idf(d)
    cs = cos_similarity(tfm)
    # Compare with `≈` rather than `==`: the values are floating-point results of a
    # matrix product and square roots, so the last bits may differ between
    # platforms or linear-algebra backends.
    @test cs ≈ [
        1.0                 0.03293177886245518 0.0;
        0.03293177886245518 1.0                 0.0;
        0.0                 0.0                 1.0
    ]
end
end

0 comments on commit 7b3ce8c

Please sign in to comment.