From 2fac30f17b874d18186864a62e1453f77b7f01c5 Mon Sep 17 00:00:00 2001
From: James McKinney
Date: Sun, 9 Sep 2012 12:03:38 -0400
Subject: [PATCH] first commit

---
 .gitignore                                 |   6 ++
 .travis.yml                                |   3 +
 Gemfile                                    |   4 +
 LICENSE                                    |  20 ++++
 README.md                                  |  28 ++++++
 Rakefile                                   |  16 +++
 USAGE                                      |   1 +
 lib/tf-idf-similarity.rb                   |   7 ++
 lib/tf-idf-similarity/collection.rb        |  66 +++++++++++++
 lib/tf-idf-similarity/document.rb          |  74 ++++++++++++++
 lib/tf-idf-similarity/extras/collection.rb |  83 ++++++++++++++++
 lib/tf-idf-similarity/extras/document.rb   | 110 +++++++++++++++++++++
 lib/tf-idf-similarity/token.rb             |  40 ++++++++
 td-idf-similarity.gemspec                  |  22 +++++
 14 files changed, 480 insertions(+)
 create mode 100644 .gitignore
 create mode 100644 .travis.yml
 create mode 100644 Gemfile
 create mode 100644 LICENSE
 create mode 100644 README.md
 create mode 100644 Rakefile
 create mode 100644 USAGE
 create mode 100644 lib/tf-idf-similarity.rb
 create mode 100644 lib/tf-idf-similarity/collection.rb
 create mode 100644 lib/tf-idf-similarity/document.rb
 create mode 100644 lib/tf-idf-similarity/extras/collection.rb
 create mode 100644 lib/tf-idf-similarity/extras/document.rb
 create mode 100644 lib/tf-idf-similarity/token.rb
 create mode 100644 td-idf-similarity.gemspec

diff --git a/.gitignore b/.gitignore
new file mode 100644
index 0000000..bb8dda6
--- /dev/null
+++ b/.gitignore
@@ -0,0 +1,6 @@
+*.gem
+.bundle
+.yardoc
+Gemfile.lock
+doc/*
+pkg/*
diff --git a/.travis.yml b/.travis.yml
new file mode 100644
index 0000000..11a115a
--- /dev/null
+++ b/.travis.yml
@@ -0,0 +1,3 @@
+language: ruby
+rvm:
+  - 1.9.3
diff --git a/Gemfile b/Gemfile
new file mode 100644
index 0000000..22019ab
--- /dev/null
+++ b/Gemfile
@@ -0,0 +1,4 @@
+source "http://rubygems.org"
+
+# Specify your gem's dependencies in this gem's gemspec file
+gemspec
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000..edb6c89
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2012 Open North Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
\ No newline at end of file
diff --git a/README.md b/README.md
new file mode 100644
index 0000000..561afdf
--- /dev/null
+++ b/README.md
@@ -0,0 +1,28 @@
+# Ruby Vector Space Model (VSM) with tf*idf weights
+
+For performance, use [Lucene](http://lucene.apache.org/core/), which implements other information retrieval functions, like [BM25](http://en.wikipedia.org/wiki/Okapi_BM25).
+
+## Usage
+
+    require 'tf-idf-similarity'
+
+
+## Extras
+
+You can access more term frequency, document frequency, and normalization formulas with:
+
+    require 'tf-idf-similarity/extras/collection'
+    require 'tf-idf-similarity/extras/document'
+
+The default tf*idf formula follows [Lucene](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
+
+## Papers
+
+* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf)
+* [E. Chisholm and T. G. Kolda. "New term weighting formulas for the vector space method in information retrieval." Technical Report Number ORNL-TM-13756. Oak Ridge National Laboratory, Oak Ridge, TN, USA. 1999.](http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf)
+
+## Bugs? Questions?
+
+This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed.
+
+Copyright (c) 2012 Open North Inc., released under the MIT license
diff --git a/Rakefile b/Rakefile
new file mode 100644
index 0000000..dec9518
--- /dev/null
+++ b/Rakefile
@@ -0,0 +1,16 @@
+require 'bundler'
+Bundler::GemHelper.install_tasks
+
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+
+task :default => :spec
+
+begin
+  require 'yard'
+  YARD::Rake::YardocTask.new
+rescue LoadError
+  task :yard do
+    abort 'YARD is not available. In order to run yard, you must: gem install yard'
+  end
+end
diff --git a/USAGE b/USAGE
new file mode 100644
index 0000000..e0be16c
--- /dev/null
+++ b/USAGE
@@ -0,0 +1 @@
+See README.md for full usage details.
diff --git a/lib/tf-idf-similarity.rb b/lib/tf-idf-similarity.rb
new file mode 100644
index 0000000..9db342a
--- /dev/null
+++ b/lib/tf-idf-similarity.rb
@@ -0,0 +1,7 @@
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
+require 'unicode_utils' # Document and Token call UnicodeUtils.each_word and UnicodeUtils.downcase
+module TfIdfSimilarity
+  autoload :Collection, 'tf-idf-similarity/collection'
+  autoload :Document, 'tf-idf-similarity/document'
+  autoload :Token, 'tf-idf-similarity/token'
+end
diff --git a/lib/tf-idf-similarity/collection.rb b/lib/tf-idf-similarity/collection.rb
new file mode 100644
index 0000000..e707d5f
--- /dev/null
+++ b/lib/tf-idf-similarity/collection.rb
@@ -0,0 +1,66 @@
+require 'matrix'
+
+class TfIdfSimilarity::Collection
+  # The documents in the collection.
+  attr_reader :documents
+  # The number of times each term appears in all documents.
+  attr_reader :term_counts
+  # The number of documents each term appears in.
+  attr_reader :document_counts
+
+  def initialize
+    @documents = []
+    @term_counts = Hash.new 0
+    @document_counts = Hash.new 0
+  end
+
+  def <<(document) # appends the document and folds its term counts into the totals
+    document.term_counts.each do |term,count|
+      @term_counts[term] += count
+      @document_counts[term] += 1
+    end
+    @documents << document
+  end
+
+  # @return [Array] the set of the collection's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+
+  # @note Use GSL or Linalg, or a package that implements sparse matrices, if
+  #   Ruby's Matrix performance is too slow.
+  #
+  # @see http://en.wikipedia.org/wiki/Vector_space_model
+  # @see http://en.wikipedia.org/wiki/Document-term_matrix
+  # @see http://en.wikipedia.org/wiki/Cosine_similarity
+  def similarity_matrix
+    terms, idf = self.terms, [] # build the term list once, not once per cell; idf is memoized per row
+
+    term_document_matrix = Matrix.build(terms.size, documents.size) do |i,j|
+      idf[i] ||= inverse_document_frequency(terms[i])
+      documents[j].term_frequency(terms[i]) * idf[i]
+    end
+
+    # Columns are normalized to unit vectors, so the product below holds the
+    # cosine similarity of every pair of document vectors.
+    matrix = normalize term_document_matrix
+    matrix.transpose * matrix
+  end
+
+  # @param [String] term a term
+  # @return [Float] 1 + log2(number of documents / (documents containing the term + 1))
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def inverse_document_frequency(term)
+    1 + Math.log2(documents.size / (document_counts[term].to_f + 1))
+  end
+  alias_method :idf, :inverse_document_frequency
+
+  # @param [Matrix] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note Lucene normalizes document length differently.
+  def normalize(matrix)
+    Matrix.columns matrix.column_vectors.map(&:normalize)
+  end
+end
diff --git a/lib/tf-idf-similarity/document.rb b/lib/tf-idf-similarity/document.rb
new file mode 100644
index 0000000..469fed8
--- /dev/null
+++ b/lib/tf-idf-similarity/document.rb
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+class TfIdfSimilarity::Document
+  # An optional document identifier.
+  attr_reader :id
+  # The document's text.
+  attr_reader :text
+  # The number of times each term appears in the document.
+  attr_reader :term_counts
+  # The maximum term count of any term in the document.
+  attr_reader :maximum_term_count
+  # The average term count of all terms in the document.
+  attr_reader :average_term_count
+
+  # @param [String] text the document's text
+  # @param [Hash] opts optional arguments
+  # @option opts [String] :id a string to identify the document
+  def initialize(text, opts = {})
+    @text = text
+    @id = opts[:id] || object_id
+    @term_counts = Hash.new 0
+    process
+  end
+
+  # @return [Array] the set of the document's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+
+  # @param [String] term a term
+  # @return [Integer] the number of times the term appears in the document
+  def term_count(term)
+    term_counts[term]
+  end
+
+  # @param [String] term a term
+  # @return [Float] the square root of the term count
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def term_frequency(term)
+    Math.sqrt term_count(term)
+  end
+  alias_method :tf, :term_frequency
+
+private
+
+  # Tokenizes the text, counts the valid terms and derives the count statistics.
+  def process
+    tokenize(text).each do |word|
+      token = TfIdfSimilarity::Token.new word
+      if token.valid?
+        @term_counts[TfIdfSimilarity::Token.new(token.lowercase_filter).classic_filter.to_s] += 1
+      end
+    end
+
+    @maximum_term_count = @term_counts.values.max.to_f
+    @average_term_count = @term_counts.empty? ? 0.0 : @term_counts.values.reduce(:+) / @term_counts.size.to_f
+  end
+
+  # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
+  # Word Boundary Specification.
+  #
+  # @param [String] text a text
+  # @return [Enumerator] a token enumerator
+  #
+  # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
+  #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
+  #
+  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
+  def tokenize(text)
+    UnicodeUtils.each_word text
+  end
+end
diff --git a/lib/tf-idf-similarity/extras/collection.rb b/lib/tf-idf-similarity/extras/collection.rb
new file mode 100644
index 0000000..97f4600
--- /dev/null
+++ b/lib/tf-idf-similarity/extras/collection.rb
@@ -0,0 +1,83 @@
+class TfIdfSimilarity::Collection
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_collection_frequency(term)
+    1.0
+  end
+
+  # @note SMART t, Salton f, Chisholm IDFB
+  def plain_inverse_document_frequency(term)
+    count = document_counts[term].to_f
+    Math.log2(documents.size / count)
+  end
+  alias_method :plain_idf, :plain_inverse_document_frequency
+
+  # @note SMART p, Salton p, Chisholm IDFP
+  def probabilistic_inverse_document_frequency(term)
+    count = document_counts[term].to_f
+    Math.log2((documents.size - count) / count)
+  end
+  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
+
+  # @note Chisholm IGFF
+  def global_frequency_inverse_document_frequency(term)
+    term_counts[term] / document_counts[term].to_f
+  end
+  alias_method :gfidf, :global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFL
+  def log_global_frequency_inverse_document_frequency(term)
+    Math.log(global_frequency_inverse_document_frequency(term) + 1)
+  end
+  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFI
+  def incremented_global_frequency_inverse_document_frequency(term)
+    global_frequency_inverse_document_frequency(term) + 1
+  end
+  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFS
+  def square_root_global_frequency_inverse_document_frequency(term)
+    Math.sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
+  end
+  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+
+  # @note Chisholm ENPY
+  def entropy(term)
+    denominator = term_counts[term].to_f
+    logN = Math.log2(documents.size + 1)
+    documents.reduce(0) do |sum,document|
+      quotient = document.term_count(term) / denominator
+      quotient > 0 ? sum + quotient * Math.log2(quotient) / logN : sum # skip absent terms: 0 * log2(0) would be NaN
+    end
+  end
+
+
+
+  # @param [Matrix] matrix a term-document matrix
+  # @return [Matrix] the same matrix
+  #
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_normalization(matrix)
+    matrix
+  end
+
+  # @param [Matrix] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note SMART c, Salton c, Chisholm COSN
+  def cosine_normalization(matrix)
+    Matrix.columns(matrix.column_vectors.map do |column|
+      column.normalize
+    end)
+  end
+
+  # @param [Matrix] matrix a term-document matrix
+  # @return [Matrix] a matrix
+  #
+  # @note SMART u, Chisholm PUQN
+  def pivoted_unique_normalization(matrix)
+    # @todo Not implemented yet; currently returns nil.
+    # http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
+  end
+end
diff --git a/lib/tf-idf-similarity/extras/document.rb b/lib/tf-idf-similarity/extras/document.rb
new file mode 100644
index 0000000..9904fe4
--- /dev/null
+++ b/lib/tf-idf-similarity/extras/document.rb
@@ -0,0 +1,110 @@
+class TfIdfSimilarity::Document
+  # Returns the term count.
+  #
+  # @note SMART n, Salton t, Chisholm FREQ
+  def plain_term_frequency(term)
+    term_count term
+  end
+  alias_method :plain_tf, :plain_term_frequency
+
+  # Returns 1 if the term is present, 0 otherwise.
+  #
+  # @note SMART b, Salton b, Chisholm BNRY
+  def binary_term_frequency(term)
+    count = term_count term
+    if count > 0
+      1
+    else
+      0
+    end
+  end
+  alias_method :binary_tf, :binary_term_frequency
+
+  # Normalizes the term count by the maximum term count.
+  #
+  # @see http://en.wikipedia.org/wiki/Tf*idf
+  def normalized_term_frequency(term)
+    term_count(term) / maximum_term_count
+  end
+  alias_method :normalized_tf, :normalized_term_frequency
+
+  # Rescales the normalized term frequency so that it lies between 0.5 and 1.
+  #
+  # @note SMART a, Salton n, Chisholm ATF1
+  def augmented_normalized_term_frequency(term)
+    0.5 * normalized_term_frequency(term) + 0.5
+  end
+  alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency
+
+  # @note Chisholm ATFA
+  def augmented_average_term_frequency(term)
+    occurrences = term_count(term)
+    if occurrences <= 0
+      0
+    else
+      0.9 + 0.1 * occurrences / average_term_count
+    end
+  end
+  alias_method :augmented_average_tf, :augmented_average_term_frequency
+
+  # @note Chisholm ATFC
+  def changed_coefficient_augmented_normalized_term_frequency(term)
+    occurrences = term_count(term)
+    if occurrences <= 0
+      0
+    else
+      0.2 + 0.8 * occurrences / maximum_term_count
+    end
+  end
+  alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency
+
+  # Dampens the term count with a base-2 logarithm.
+  #
+  # @note SMART l, Chisholm LOGA
+  def log_term_frequency(term)
+    occurrences = term_count(term)
+    if occurrences <= 0
+      0
+    else
+      1 + Math.log2(occurrences)
+    end
+  end
+  alias_method :log_tf, :log_term_frequency
+
+  # Dampens the term count and normalizes it by the average term count.
+  #
+  # @note SMART L, Chisholm LOGN
+  def normalized_log_term_frequency(term)
+    occurrences = term_count(term)
+    if occurrences <= 0
+      0
+    else
+      (1 + Math.log2(occurrences)) / (1 + Math.log2(average_term_count))
+    end
+  end
+  alias_method :normalized_log_tf, :normalized_log_term_frequency
+
+  # @note Chisholm LOGG
+  def augmented_log_term_frequency(term)
+    occurrences = term_count(term)
+    if occurrences <= 0
+      0
+    else
+      0.2 + 0.8 * Math.log(occurrences + 1)
+    end
+  end
+  alias_method :augmented_log_tf, :augmented_log_term_frequency
+
+  # Dampen the term count using square root.
+  #
+  # @note Chisholm SQRT
+  def square_root_term_frequency(term)
+    count = term_count term
+    if count > 0
+      Math.sqrt(count - 0.5) + 1
+    else
+      0
+    end
+  end
+  alias_method :square_root_tf, :square_root_term_frequency
+end
diff --git a/lib/tf-idf-similarity/token.rb b/lib/tf-idf-similarity/token.rb
new file mode 100644
index 0000000..6d0ed77
--- /dev/null
+++ b/lib/tf-idf-similarity/token.rb
@@ -0,0 +1,40 @@
+# @note We can add more filters from Solr and stem using Porter's Snowball.
+#
+# @see https://github.com/aurelian/ruby-stemmer
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
+# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
+class TfIdfSimilarity::Token < String
+  # Returns false if all its characters are numbers, punctuation, whitespace
+  # or control characters, and true otherwise.
+  #
+  # @note Some implementations ignore one and two-letter words.
+  #
+  # @return [Boolean] whether the string is a token
+  def valid?
+    !self[%r{
+      \A
+      (
+        \d         | # number
+        \p{Cntrl}  | # control character
+        \p{Punct}  | # punctuation
+        [[:space:]]  # whitespace
+      )+
+      \z
+    }x]
+  end
+
+  # @return [Token] a lowercase copy, still a Token so that filters can be chained
+  #
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
+  def lowercase_filter
+    self.class.new UnicodeUtils.downcase(self, :fr)
+  end
+
+  # @return [Token] a copy with periods removed and a trailing English possessive "'s" dropped
+  #
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
+  def classic_filter
+    self.class.new gsub('.', '').chomp("'s")
+  end
+end
diff --git a/td-idf-similarity.gemspec b/td-idf-similarity.gemspec
new file mode 100644
index 0000000..c8c3232
--- /dev/null
+++ b/td-idf-similarity.gemspec
@@ -0,0 +1,22 @@
+# -*- encoding: utf-8 -*-
+$:.push File.expand_path("../lib", __FILE__)
+require "tf-idf-similarity/version" # NOTE(review): this patch does not create version.rb - confirm it exists
+
+Gem::Specification.new do |s|
+  s.name        = "tf-idf-similarity"
+  s.version     = TfIdfSimilarity::VERSION
+  s.platform    = Gem::Platform::RUBY
+  s.authors     = ["Open North"]
+  s.email       = ["info@opennorth.ca"]
+  s.homepage    = "http://github.com/opennorth/tf-idf-similarity"
+  s.summary     = %q{Implements a Vector Space Model (VSM) with tf*idf weights}
+
+  s.files         = `git ls-files`.split("\n")
+  s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
+  s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
+  s.require_paths = ["lib"]
+
+  s.add_runtime_dependency('unicode_utils')
+  s.add_development_dependency('rspec', '~> 2.10')
+  s.add_development_dependency('rake')
+end