diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..bb8dda6 --- /dev/null +++ b/.gitignore @@ -0,0 +1,6 @@ +*.gem +.bundle +.yardoc +Gemfile.lock +doc/* +pkg/* diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 0000000..11a115a --- /dev/null +++ b/.travis.yml @@ -0,0 +1,3 @@ +language: ruby +rvm: + - 1.9.3 diff --git a/Gemfile b/Gemfile new file mode 100644 index 0000000..22019ab --- /dev/null +++ b/Gemfile @@ -0,0 +1,4 @@ +source "http://rubygems.org" + +# Specify your gem's dependencies in tf-idf-similarity.gemspec +gemspec diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..edb6c89 --- /dev/null +++ b/LICENSE @@ -0,0 +1,20 @@ +Copyright (c) 2012 Open North Inc. + +Permission is hereby granted, free of charge, to any person obtaining +a copy of this software and associated documentation files (the +"Software"), to deal in the Software without restriction, including +without limitation the rights to use, copy, modify, merge, publish, +distribute, sublicense, and/or sell copies of the Software, and to +permit persons to whom the Software is furnished to do so, subject to +the following conditions: + +The above copyright notice and this permission notice shall be +included in all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE +LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION +OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION +WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
\ No newline at end of file diff --git a/README.md b/README.md new file mode 100644 index 0000000..561afdf --- /dev/null +++ b/README.md @@ -0,0 +1,28 @@ +# Ruby Vector Space Model (VSM) with tf*idf weights + +For performance, use [Lucene](http://lucene.apache.org/core/), which implements other information retrieval functions, like [BM 25](http://en.wikipedia.org/wiki/Okapi_BM25). + +## Usage + + require 'tf-idf-similarity' + + +## Extras + +You can access more term frequency, document frequency, and normalization formulas with: + + require 'tf-idf-similarity/extras/collection' + require 'tf-idf-similarity/extras/document' + +The default tf*idf formula follows [Lucene](http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html). + +## Papers + +* [G. Salton and C. Buckley. "Term Weighting Approaches in Automatic Text Retrieval." Technical Report. Cornell University, Ithaca, NY, USA. 1987.](http://www.cs.odu.edu/~jbollen/IR04/readings/article1-29-03.pdf) +* [E. Chisholm and T. G. Kolda. "New term weighting formulas for the vector space method in information retrieval." Technical Report Number ORNL-TM-13756. Oak Ridge National Laboratory, Oak Ridge, TN, USA. 1999.](http://www.sandia.gov/~tgkolda/pubs/bibtgkfiles/ornl-tm-13756.pdf) + +## Bugs? Questions? + +This gem's main repository is on GitHub: [http://github.com/opennorth/tf-idf-similarity](http://github.com/opennorth/tf-idf-similarity), where your contributions, forks, bug reports, feature requests, and feedback are greatly welcomed. 
# Ruby sources of the tf-idf-similarity gem, reassembled from the flattened
# patch text into formatted, loadable code. Each "File:" marker names the path
# given in the corresponding patch header. Files from the patch that cannot be
# evaluated together with the library (README tail, Rakefile, USAGE, the
# autoload entry point and the gemspec) are preserved verbatim as comments in
# the appendix that follows this header.
#
# NOTE(review): nothing in the visible sources requires 'unicode_utils'; the
# gemspec only declares it as a runtime dependency. UnicodeUtils must already
# be loaded before Document.new or Token#lowercase_filter is called -- confirm
# where that is meant to happen.

# ---------------------------------------------------------------------------
# Appendix: non-library files carried by the same patch.
#
# README.md (last lines):
#
#   Copyright (c) 2012 Open North Inc., released under the MIT license
#
# Rakefile:
#
#   require 'bundler'
#   Bundler::GemHelper.install_tasks
#
#   require 'rspec/core/rake_task'
#   RSpec::Core::RakeTask.new(:spec)
#
#   task :default => :spec
#
#   begin
#     require 'yard'
#     YARD::Rake::YardocTask.new
#   rescue LoadError
#     task :yard do
#       abort 'YARD is not available. In order to run yard, you must: gem install yard'
#     end
#   end
#
# USAGE:
#
#   See README.md for full usage details.
#
# lib/tf-idf-similarity.rb (entry point; the autoloads are not registered in
# this single-file form because the classes are defined directly below):
#
#   $LOAD_PATH.unshift(File.expand_path(File.dirname(__FILE__))) unless $LOAD_PATH.include?(File.expand_path(File.dirname(__FILE__)))
#
#   module TfIdfSimilarity
#     autoload :Collection, 'tf-idf-similarity/collection'
#     autoload :Document, 'tf-idf-similarity/document'
#     autoload :Token, 'tf-idf-similarity/token'
#   end
#
# td-idf-similarity.gemspec
# NOTE(review): the file name is spelled "td-idf" while the gem is named
# "tf-idf-similarity", and "tf-idf-similarity/version" is required although no
# version file is added by this patch -- confirm both.
#
#   # -*- encoding: utf-8 -*-
#   $:.push File.expand_path("../lib", __FILE__)
#   require "tf-idf-similarity/version"
#
#   Gem::Specification.new do |s|
#     s.name        = "tf-idf-similarity"
#     s.version     = TfIdfSimilarity::VERSION
#     s.platform    = Gem::Platform::RUBY
#     s.authors     = ["Open North"]
#     s.email       = ["info@opennorth.ca"]
#     s.homepage    = "http://github.com/opennorth/tf-idf-similarity"
#     s.summary     = %q{Implements a Vector Space Model (VSM) with tf*idf weights}
#
#     s.files         = `git ls-files`.split("\n")
#     s.test_files    = `git ls-files -- {test,spec,features}/*`.split("\n")
#     s.executables   = `git ls-files -- bin/*`.split("\n").map{ |f| File.basename(f) }
#     s.require_paths = ["lib"]
#
#     s.add_runtime_dependency('unicode_utils')
#     s.add_development_dependency('rspec', '~> 2.10')
#     s.add_development_dependency('rake')
#   end
# ---------------------------------------------------------------------------

require 'matrix'

# Namespace for the vector space model classes.
module TfIdfSimilarity
end

# File: lib/tf-idf-similarity/token.rb
#
# A single word produced by the tokenizer, with filters that normalize it
# into a term.
#
# @note We can add more filters from Solr and stem using Porter's Snowball.
#
# @see https://github.com/aurelian/ruby-stemmer
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StopFilterFactory
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.WordDelimiterFilterFactory
# @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.SynonymFilterFactory
class TfIdfSimilarity::Token < String
  # Returns false if the token is empty or if all its characters are numbers,
  # punctuation, whitespace or control characters.
  #
  # @note Some implementations ignore one and two-letter words.
  #
  # @return [Boolean] whether the string is a token
  def valid?
    # The pattern matches strings made up ONLY of non-word characters, so a
    # token is valid exactly when the pattern does not match.
    !empty? && self !~ %r{
      \A
      (
        \d          | # number
        \p{Cntrl}   | # control character
        \p{Punct}   | # punctuation
        [[:space:]]   # whitespace
      )+
      \z
    }x
  end

  # @return [Token] a lowercase copy of the token
  #
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.LowerCaseFilterFactory
  def lowercase_filter
    # Wrapped in a Token so that filters can be chained; UnicodeUtils.downcase
    # is handed a Token but its result is re-wrapped regardless of its class.
    self.class.new(UnicodeUtils.downcase(self, :fr))
  end

  # Removes every period and a trailing English possessive ('s).
  #
  # @return [Token] a copy with no periods and no trailing possessive
  #
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.ClassicFilterFactory
  def classic_filter
    # String#gsub returns a plain String for subclasses on Ruby 3.0 and
    # later, so the result is re-wrapped to keep the filters chainable.
    self.class.new(gsub('.', '').chomp("'s"))
  end
end

# File: lib/tf-idf-similarity/document.rb
#
# A text together with the per-term counts extracted from it.
class TfIdfSimilarity::Document
  # An optional document identifier.
  attr_reader :id
  # The document's text.
  attr_reader :text
  # The number of times each term appears in the document.
  attr_reader :term_counts
  # The maximum term count of any term in the document.
  attr_reader :maximum_term_count
  # The average term count of all terms in the document.
  attr_reader :average_term_count

  # @param [String] text the document's text
  # @param [Hash] opts optional arguments
  # @option opts [String] :id a string to identify the document
  def initialize(text, opts = {})
    @text = text
    @id = opts[:id] || object_id
    @term_counts = Hash.new(0)
    process
  end

  # @return [Array] the set of the document's terms with no duplicates
  def terms
    term_counts.keys
  end

  # @param [String] term a term
  # @return [Integer] the number of times the term appears in the document
  def term_count(term)
    term_counts[term]
  end

  # @param [String] term a term
  # @return [Float] the square root of the term count
  #
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
  def term_frequency(term)
    Math.sqrt(term_count(term))
  end
  alias_method :tf, :term_frequency

  private

  # Tokenizes the text, counts the terms and derives the maximum and average
  # term counts. A text with no valid tokens yields 0.0 for both statistics.
  def process
    tokenize(text).each do |word|
      # Fully qualified: with the compact `class TfIdfSimilarity::Document`
      # form, a bare `Token` is not looked up inside TfIdfSimilarity.
      token = TfIdfSimilarity::Token.new(word)
      next unless token.valid?

      term = token.lowercase_filter.classic_filter.to_s
      # A token such as "'s" is reduced to nothing by the filters.
      next if term.empty?

      @term_counts[term] += 1
    end

    counts = @term_counts.values
    @maximum_term_count = counts.max.to_f
    @average_term_count = counts.empty? ? 0.0 : counts.reduce(:+) / counts.size.to_f
  end

  # Tokenizes a text, respecting the word boundary rules from Unicode's Default
  # Word Boundary Specification.
  #
  # @param [String] text a text
  # @return [Enumerator] a token enumerator
  #
  # @note We should evaluate the tokenizers by {http://www.sciencemag.org/content/suppl/2010/12/16/science.1199644.DC1/Michel.SOM.revision.2.pdf Google}
  #   or {http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.UAX29URLEmailTokenizerFactory Solr}.
  #
  # @see http://unicode.org/reports/tr29/#Default_Word_Boundaries
  # @see http://wiki.apache.org/solr/AnalyzersTokenizersTokenFilters#solr.StandardTokenizerFactory
  def tokenize(text)
    UnicodeUtils.each_word(text)
  end
end

# File: lib/tf-idf-similarity/collection.rb
#
# A set of documents plus the collection-wide counts used to weight terms.
class TfIdfSimilarity::Collection
  # The documents in the collection.
  attr_reader :documents
  # The number of times each term appears in all documents.
  attr_reader :term_counts
  # The number of documents each term appears in.
  attr_reader :document_counts

  def initialize
    @documents = []
    @term_counts = Hash.new(0)
    @document_counts = Hash.new(0)
  end

  # Adds a document and folds its term counts into the collection's totals.
  #
  # @param [Document] document a document
  # @return [Array] the collection's documents, including the one just added
  def <<(document)
    document.term_counts.each do |term, count|
      @term_counts[term] += count
      @document_counts[term] += 1
    end
    @documents << document
  end

  # @return [Array] the set of the collection's terms with no duplicates
  def terms
    term_counts.keys
  end

  # Builds the tf*idf term-document matrix, normalizes its columns and
  # multiplies it by its transpose, so that entry (i, j) is the cosine
  # similarity of documents i and j.
  #
  # @return [Matrix] a documents-by-documents similarity matrix
  #
  # @note Use GSL or Linalg, or a package that implements sparse matrices, if
  #   Ruby's Matrix performance is too slow.
  #
  # @see http://en.wikipedia.org/wiki/Vector_space_model
  # @see http://en.wikipedia.org/wiki/Document-term_matrix
  # @see http://en.wikipedia.org/wiki/Cosine_similarity
  def similarity_matrix
    # Computed once: #terms allocates a new array on every call.
    vocabulary = terms
    weights = vocabulary.map { |term| inverse_document_frequency(term) }

    term_document_matrix = Matrix.build(vocabulary.size, documents.size) do |i, j|
      documents[j].term_frequency(vocabulary[i]) * weights[i]
    end

    # Columns are normalized to unit vectors, so we can calculate the cosine
    # similarity of all document vectors.
    matrix = normalize(term_document_matrix)
    matrix.transpose * matrix
  end

  # @param [String] term a term
  # @return [Float] the term's inverse document frequency
  #
  # @see http://lucene.apache.org/core/4_0_0-BETA/core/org/apache/lucene/search/similarities/TFIDFSimilarity.html
  def inverse_document_frequency(term)
    1 + Math.log2(documents.size / (document_counts[term].to_f + 1))
  end
  alias_method :idf, :inverse_document_frequency

  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix in which all non-zero document vectors are unit
  #   vectors; all-zero columns are returned unchanged
  #
  # @note Lucene normalizes document length differently.
  def normalize(matrix)
    columns = matrix.column_vectors.map do |column|
      # Vector#normalize raises on a zero vector (a document with no terms).
      (column.r.zero? ? column : column.normalize).to_a
    end
    Matrix.columns(columns)
  end
end

# File: lib/tf-idf-similarity/extras/collection.rb
#
# Additional collection frequency and normalization formulas.
class TfIdfSimilarity::Collection
  # @param [String] term a term (unused)
  # @return [Float] always 1.0
  #
  # @note SMART n, Salton x, Chisholm NONE
  def no_collection_frequency(term)
    1.0
  end

  # @param [String] term a term
  # @return [Float] log2 of documents per document containing the term
  #
  # @note SMART t, Salton f, Chisholm IDFB
  def plain_inverse_document_frequency(term)
    count = document_counts[term].to_f
    Math.log2(documents.size / count)
  end
  alias_method :plain_idf, :plain_inverse_document_frequency

  # @param [String] term a term
  # @return [Float] log2 of the ratio of documents without the term to
  #   documents with it
  #
  # @note SMART p, Salton p, Chisholm IDFP
  def probabilistic_inverse_document_frequency(term)
    count = document_counts[term].to_f
    Math.log2((documents.size - count) / count)
  end
  alias_method :probabilistic_idf, :probabilistic_inverse_document_frequency

  # @param [String] term a term
  # @return [Float] the term's total count divided by its document count
  #
  # @note Chisholm IGFF
  def global_frequency_inverse_document_frequency(term)
    term_counts[term] / document_counts[term].to_f
  end
  alias_method :gfidf, :global_frequency_inverse_document_frequency

  # @param [String] term a term
  # @return [Float] the natural log of (gfidf + 1)
  #
  # @note Chisholm IGFL
  def log_global_frequency_inverse_document_frequency(term)
    Math.log(global_frequency_inverse_document_frequency(term) + 1)
  end
  alias_method :log_gfidf, :log_global_frequency_inverse_document_frequency

  # @param [String] term a term
  # @return [Float] gfidf + 1
  #
  # @note Chisholm IGFI
  def incremented_global_frequency_inverse_document_frequency(term)
    global_frequency_inverse_document_frequency(term) + 1
  end
  alias_method :incremented_gfidf, :incremented_global_frequency_inverse_document_frequency

  # @param [String] term a term
  # @return [Float] the square root of (gfidf - 0.9)
  #
  # @note Chisholm IGFS
  def square_root_global_frequency_inverse_document_frequency(term)
    Math.sqrt(global_frequency_inverse_document_frequency(term) - 0.9)
  end
  alias_method :square_root_gfidf, :square_root_global_frequency_inverse_document_frequency

  # Sums p * log2(p) / log2(N + 1) over the documents containing the term,
  # where p is the document's share of the term's total count and N is the
  # number of documents.
  #
  # @param [String] term a term
  # @return [Numeric] the sum described above (0 if no document has the term)
  #
  # @note Chisholm ENPY
  # @note NOTE(review): published entropy weights are usually written with a
  #   leading "1 +"; this method has never added it -- confirm against the
  #   paper before relying on the absolute values.
  def entropy(term)
    denominator = term_counts[term].to_f
    log_n = Math.log2(documents.size + 1)
    documents.reduce(0) do |sum, document|
      count = document.term_count(term)
      # 0 * log2(0) is NaN in floating point; such documents contribute 0.
      next sum if count.zero?

      quotient = count / denominator
      sum + quotient * Math.log2(quotient) / log_n
    end
  end

  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] the same matrix
  #
  # @note SMART n, Salton x, Chisholm NONE
  def no_normalization(matrix)
    matrix
  end

  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix in which all non-zero document vectors are unit
  #   vectors; all-zero columns are returned unchanged
  #
  # @note SMART c, Salton c, Chisholm COSN
  def cosine_normalization(matrix)
    columns = matrix.column_vectors.map do |column|
      (column.r.zero? ? column : column.normalize).to_a
    end
    Matrix.columns(columns)
  end

  # Not implemented yet; currently returns nil.
  #
  # @param [Matrix] matrix a term-document matrix
  # @return [Matrix] a matrix
  #
  # @note SMART u, Chisholm PUQN
  def pivoted_unique_normalization(matrix)
    # @todo
    # http://nlp.stanford.edu/IR-book/html/htmledition/pivoted-normalized-document-length-1.html
  end
end

# File: lib/tf-idf-similarity/extras/document.rb
#
# Additional term frequency formulas.
class TfIdfSimilarity::Document
  # Returns the term count.
  #
  # @note SMART n, Salton t, Chisholm FREQ
  def plain_term_frequency(term)
    term_count(term)
  end
  alias_method :plain_tf, :plain_term_frequency

  # Returns 1 if the term is present, 0 otherwise.
  #
  # @note SMART b, Salton b, Chisholm BNRY
  def binary_term_frequency(term)
    term_count(term) > 0 ? 1 : 0
  end
  alias_method :binary_tf, :binary_term_frequency

  # Normalizes the term count by the maximum term count.
  #
  # @see http://en.wikipedia.org/wiki/Tf*idf
  def normalized_term_frequency(term)
    term_count(term) / maximum_term_count
  end
  alias_method :normalized_tf, :normalized_term_frequency

  # Further normalizes the normalized term frequency to lie between 0.5 and 1.
  #
  # @note SMART a, Salton n, Chisholm ATF1
  def augmented_normalized_term_frequency(term)
    0.5 + 0.5 * normalized_term_frequency(term)
  end
  alias_method :augmented_normalized_tf, :augmented_normalized_term_frequency

  # Returns 0.9 + 0.1 * count / average count for a present term, else 0.
  #
  # @note Chisholm ATFA
  def augmented_average_term_frequency(term)
    count = term_count(term)
    count > 0 ? 0.9 + 0.1 * count / average_term_count : 0
  end
  alias_method :augmented_average_tf, :augmented_average_term_frequency

  # Returns 0.2 + 0.8 * count / maximum count for a present term, else 0.
  #
  # @note Chisholm ATFC
  def changed_coefficient_augmented_normalized_term_frequency(term)
    count = term_count(term)
    count > 0 ? 0.2 + 0.8 * count / maximum_term_count : 0
  end
  alias_method :changed_coefficient_augmented_normalized_tf, :changed_coefficient_augmented_normalized_term_frequency

  # Dampen the term count using log.
  #
  # @note SMART l, Chisholm LOGA
  def log_term_frequency(term)
    count = term_count(term)
    count > 0 ? 1 + Math.log2(count) : 0
  end
  alias_method :log_tf, :log_term_frequency

  # Dampen and normalize the term count by the average term count.
  #
  # @note SMART L, Chisholm LOGN
  def normalized_log_term_frequency(term)
    count = term_count(term)
    count > 0 ? (1 + Math.log2(count)) / (1 + Math.log2(average_term_count)) : 0
  end
  alias_method :normalized_log_tf, :normalized_log_term_frequency

  # Returns 0.2 + 0.8 * ln(count + 1) for a present term, else 0.
  #
  # @note Chisholm LOGG
  def augmented_log_term_frequency(term)
    count = term_count(term)
    count > 0 ? 0.2 + 0.8 * Math.log(count + 1) : 0
  end
  alias_method :augmented_log_tf, :augmented_log_term_frequency

  # Dampen the term count using square root.
  #
  # @note Chisholm SQRT
  def square_root_term_frequency(term)
    count = term_count(term)
    count > 0 ? Math.sqrt(count - 0.5) + 1 : 0
  end
  alias_method :square_root_tf, :square_root_term_frequency
end