Skip to content

Commit

Permalink
Merge pull request #32 from kou/bm25-fix-document-length
Browse files Browse the repository at this point in the history
Fix document length value in BM25
  • Loading branch information
jpmckinney authored Sep 17, 2020
2 parents 60ff32b + a25c7e2 commit ebe8431
Show file tree
Hide file tree
Showing 2 changed files with 13 additions and 9 deletions.
8 changes: 6 additions & 2 deletions lib/tf-idf-similarity/bm25_model.rb
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,12 @@ def inverse_document_frequency(term)
#
# @note Like Lucene, we use a b value of 0.75 and a k1 value of 1.2.
# @param document the document to score; must respond to +term_count+ and +size+
# @param term the term whose frequency is being weighted
# @return [Float] the BM25 term-frequency component, or +Float::NAN+ when the
#   model's average document size is zero
def term_frequency(document, term)
  average_size = @model.average_document_size
  # With a zero average document size the length normalization is undefined.
  return Float::NAN if average_size.zero?

  tf = document.term_count(term)
  # With k1 = 1.2 and b = 0.75: 2.2 = k1 + 1, 0.3 = k1 * (1 - b), 0.9 = k1 * b.
  # The length ratio uses this document's own size, not the number of
  # documents in the corpus.
  (tf * 2.2) / (tf + 0.3 + 0.9 * document.size / average_size)
end
alias_method :tf, :term_frequency
end
Expand Down
14 changes: 7 additions & 7 deletions spec/bm25_model_spec.rb
Original file line number Diff line number Diff line change
Expand Up @@ -147,23 +147,23 @@ def similarity_matrix_values(model)
end

it 'should return the term frequency if tokens given' do
  # The length ratio is document length / average document size (3 / 5.5 here).
  model.tf(document_with_tokens, 'foo-foo').should == (1 * 2.2) / (1 + 0.3 + 0.9 * 3 / 5.5)
end

it 'should return no term frequency if no text given' do
# Expects a result of 0; presumably the term count is 0 for a document
# without text (fixture defined outside this hunk -- confirm).
model.tf(document_without_text, 'foo').should == 0
end

it 'should return the term frequency if term counts given' do
  # The length ratio is document length / average document size (15 / 5.5 here).
  model.tf(document_with_term_counts, 'bar').should == (5 * 2.2) / (5 + 0.3 + 0.9 * 15 / 5.5)
end

it 'should return the term frequency of a non-occurring term' do
# Expects a result of 0 for a term that is presumably absent from the
# document (fixture defined outside this hunk -- confirm).
model.tf(document, 'xxx').should == 0
end

it 'should return the term frequency in a non-occurring document' do
  # The length ratio is document length / average document size (3 / 5.5 here).
  model.tf(non_corpus_document, 'foo').should == (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5)
end
end

Expand All @@ -177,17 +177,17 @@ def similarity_matrix_values(model)
end

it 'should return the tf*idf in a non-occurring term' do
  # Expected value is the idf factor (the Math.log term) times the tf factor,
  # whose length ratio is document length / average document size (3 / 5.5 here).
  model.tfidf(non_corpus_document, 'foo').should be_within(0.001).of(Math.log((4 - 1 + 0.5) / (1 + 0.5)) * (3 * 2.2) / (3 + 0.3 + 0.9 * 3 / 5.5))
end
end

describe '#similarity_matrix' do
it 'should return the similarity matrix' do
expected = [
1.0, 0.564, 0.0, 0.479,
0.564, 1.0, 0.0, 0.540,
1.0, 0.558, 0.0, 0.449,
0.558, 1.0, 0.0, 0.501,
0.0, 0.0, 0.0, 0.0,
0.479, 0.540, 0.0, 1.0,
0.449, 0.501, 0.0, 1.0,
]

similarity_matrix_values(model).each_with_index do |value,i|
Expand Down

0 comments on commit ebe8431

Please sign in to comment.