first commit

jpmckinney · Sep 9, 2012 · 2fac30f · 2fac30f
commit 2fac30f
Show file tree

Hide file tree

Showing 14 changed files with 480 additions and 0 deletions.
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,6 @@
+*.gem
+.bundle
+.yardoc
+Gemfile.lock
+doc/*
+pkg/*
diff --git a/.travis.yml b/.travis.yml
@@ -0,0 +1,3 @@
+language: ruby
+rvm:
+  - 1.9.3
diff --git a/Gemfile b/Gemfile
@@ -0,0 +1,4 @@
+# Fetch gems over HTTPS so downloads cannot be tampered with in transit.
+source "https://rubygems.org"
+
+# Specify your gem's dependencies in this project's gemspec
+gemspec
diff --git a/LICENSE b/LICENSE
@@ -0,0 +1,20 @@
+Copyright (c) 2012 Open North Inc.
+
+Permission is hereby granted, free of charge, to any person obtaining
+a copy of this software and associated documentation files (the
+"Software"), to deal in the Software without restriction, including
+without limitation the rights to use, copy, modify, merge, publish,
+distribute, sublicense, and/or sell copies of the Software, and to
+permit persons to whom the Software is furnished to do so, subject to
+the following conditions:
+
+The above copyright notice and this permission notice shall be
+included in all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
diff --git a/README.md b/README.md
@@ -0,0 +1,28 @@
+# Ruby Vector Space Model (VSM) with tf*idf weights
+
+For performance, use [Lucene](http://lucene.apache.org/core/), which implements other information retrieval functions, like [BM25](http://en.wikipedia.org/wiki/Okapi_BM25).
+
+## Usage
+
+    require 'tf-idf-similarity'
+
+
+## Extras
+
+You can access more term frequency, document frequency, and normalization formulas with:
+
+    require 'tf-idf-similarity/extras/collection'
+    require 'tf-idf-similarity/extras/document'
+
+The default tf*idf formula follows [Lucene](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html).
+
+## Papers
+
+* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf)
+* [E. Chisholm and T. G. Kolda. "New term weighting formulas for the vector space method in information retrieval." Technical Report Number ORNL-TM-13756. Oak Ridge National Laboratory, Oak Ridge, TN, USA. 1999.](http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf)
+
+## Bugs? Questions?
+
+This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed.
+
+Copyright (c) 2012 Open North Inc., released under the MIT license
diff --git a/Rakefile b/Rakefile
@@ -0,0 +1,16 @@
+require 'bundler'
+Bundler::GemHelper.install_tasks
+
+require 'rspec/core/rake_task'
+RSpec::Core::RakeTask.new(:spec)
+
+task :default => :spec
+
+begin
+  require 'yard'
+  YARD::Rake::YardocTask.new
+rescue LoadError
+  task :yard do
+    abort 'YARD is not available. In order to run yard, you must: gem install yard'
+  end
+end
diff --git a/USAGE b/USAGE
@@ -0,0 +1 @@
+See README.md for full usage details.
diff --git a/lib/tf-idf-similarity.rb b/lib/tf-idf-similarity.rb
@@ -0,0 +1,7 @@
+$LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
+
+module TfIdfSimilarity
+  autoload :Collection, 'tf-idf-similarity/collection'
+  autoload :Document, 'tf-idf-similarity/document'
+  autoload :Token, 'tf-idf-similarity/token'
+end
diff --git a/lib/tf-idf-similarity/collection.rb b/lib/tf-idf-similarity/collection.rb
@@ -0,0 +1,66 @@
+require 'matrix'
+
+class TfIdfSimilarity::Collection
+  # The documents in the collection.
+  attr_reader :documents
+  # The number of times each term appears in all documents.
+  attr_reader :term_counts
+  # The number of documents each term appears in.
+  attr_reader :document_counts
+
+  def initialize
+    @documents       = []
+    @term_counts     = Hash.new 0
+    @document_counts = Hash.new 0
+  end
+
+  def <<(document)
+    document.term_counts.each do |term,count|
+      @term_counts[term]     += count
+      @document_counts[term] += 1
+    end
+    @documents << document
+  end
+
+  # @return [Array<String>] the set of the collection's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+
+  # @note Use GSL or Linalg, or a package that implements sparse matrices, if
+  #   Ruby's Matrix performance is too slow.
+  #
+  # @see http://en.wikipedia.org/wiki/Vector_space_model
+  # @see http://en.wikipedia.org/wiki/Document-term_matrix
+  # @see http://en.wikipedia.org/wiki/Cosine_similarity
+  def similarity_matrix
+    idf = []
+
+    term_document_matrix = Matrix.build(terms.size, documents.size) do |i,j|
+      idf[i] ||= inverse_document_frequency terms[i]
+      documents[j].term_frequency(terms[i]) * idf[i]
+    end
+
+    # Columns are normalized to unit vectors, so we can calculate the cosine
+    # similarity of all document vectors.
+    matrix = normalize term_document_matrix
+    matrix.transpose * matrix
+  end
+
+  # @param [String] term a term
+  # @return [Float] the term's inverse document frequency
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def inverse_document_frequency(term)
+    1 + Math.log2 documents.size / (document_counts(term).to_f + 1)
+  end
+  alias_method :idf, :inverse_document_frequency
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note Lucene normalizes document length differently.
+  def normalize(matrix)
+    Matrix.columns tfidf.column_vectors.map(&:normalize)
+  end
+end
diff --git a/lib/tf-idf-similarity/document.rb b/lib/tf-idf-similarity/document.rb
@@ -0,0 +1,74 @@
+# coding: utf-8
+
+class TfIdfSimilarity::Document
+  # An optional document identifier.
+  attr_reader :id
+  # The document's text.
+  attr_reader :text
+  # The number of times each term appears in the document.
+  attr_reader :term_counts
+  # The maximum term count of any term in the document.
+  attr_reader :maximum_term_count
+  # The average term count of all terms in the document.
+  attr_reader :average_term_count
+
+  # @param [String] text the document's text
+  # @param [Hash] opts optional arguments
+  # @option opts [String] :id a string to identify the document
+  def initialize(text, opts = {})
+    @text        = text
+    @id          = opts[:id] || object_id
+    @term_counts = Hash.new 0
+    process
+  end
+
+  # @return [Array<String>] the set of the document's terms with no duplicates
+  def terms
+    term_counts.keys
+  end
+
+  # @param [String] term a term
+  # @return [Integer] the number of times the term appears in the document
+  def term_count(term)
+    term_counts[term]
+  end
+
+  # @param [String] term a term
+  # @return [Float] the square root of the term count
+  #
+  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
+  def term_frequency(term)
+    Math.sqrt term_count(term)
+  end
+  alias_method :tf, :term_frequency
+
+private
+
+  # Tokenize the text and counts terms.
+  def process
+    tokenize(text).each do |word|
+      token = Token.new word
+      if token.valid?
+        @term_counts[token.lowercase_filter.classic_filter.to_s] += 1
+      end
+    end
+
+    @maximum_term_count = @term_counts.values.max.to_f
+    @average_term_count = @term_counts.values.reduce(:+) / @term_counts.size.to_f
+  end
+
+  # Tokenizes a text, respecting the word boundary rules from Unicode’s Default
+  # Word Boundary Specification.
+  #
+  # @param [String] text a text
+  # @return [Enumerator] a token enumerator
+  #
+  # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
+  #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
+  #
+  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
+  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
+  def tokenize(text)
+    UnicodeUtils.each_word text
+  end
+end
diff --git a/lib/tf-idf-similarity/extras/collection.rb b/lib/tf-idf-similarity/extras/collection.rb
@@ -0,0 +1,83 @@
+class TfIdfSimilarity::Collection
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_collection_frequency(term)
+    1.0
+  end
+
+  # @note SMART t, Salton f, Chisholm IDFB
+  def plain_inverse_document_frequency(term)
+    count = document_counts(term).to_f
+    Math.log2 documents.size / count
+  end
+  alias_method :plain_idf, :plain_inverse_document_frequency
+
+  # @note SMART p, Salton p, Chisholm IDFP
+  def probabilistic_inverse_document_frequency(term)
+    count = document_counts(term).to_f
+    Math.log2 (documents.size - count) / count
+  end
+  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency
+
+  # @note Chisholm IGFF
+  def global_frequency_inverse_document_frequency(term)
+    term_counts[term] / document_counts(term).to_f
+  end
+  alias_method :gfidf, :global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFL
+  def log_global_frequency_inverse_document_frequency(term)
+    Math.log global_frequency_inverse_document_frequency(term) + 1
+  end
+  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFI
+  def incremented_global_frequency_inverse_document_frequency(term)
+    global_frequency_inverse_document_frequency(term) + 1
+  end
+  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency
+
+  # @note Chisholm IGFS
+  def square_root_global_frequency_inverse_document_frequency(term)
+    Math.sqrt global_frequency_inverse_document_frequency(term) - 0.9
+  end
+  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency
+
+  # @note Chisholm ENPY
+  def entropy(term)
+    denominator = term_counts[term].to_f
+    logN = Math.log2 documents.size
+    1 + documents.reduce(0) do |sum,document|
+      quotient = document.term_count(term) / denominator
+      sum += quotient * Math.log2(quotient) / logN
+    end
+  end
+
+
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] the same matrix
+  #
+  # @note SMART n, Salton x, Chisholm NONE
+  def no_normalization(matrix)
+    matrix
+  end
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix in which all document vectors are unit vectors
+  #
+  # @note SMART c, Salton c, Chisholm COSN
+  def cosine_normalization(matrix)
+    Matrix.columns(tfidf.column_vectors.map do |column|
+      column.normalize
+    end)
+  end
+
+  # @param [Document] matrix a term-document matrix
+  # @return [Matrix] a matrix
+  #
+  # @note SMART u, Chisholm PUQN
+  def pivoted_unique_normalization(matrix)
+    # @todo
+    # http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
+  end
+end